## Loading required package: digest
## Loading required package: tibble
## Loading required package: ggplot2
## Project name: 01.protein-seq-evo-v1
## Loading project configuration
## Autoloading packages
##  Loading package: dplyr
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
##  Loading package: stringr
## Loading required package: stringr
##  Loading package: readr
## Loading required package: readr
##  Loading package: ggplot2
##  Loading package: tidyr
## Loading required package: tidyr
##  Loading package: patchwork
## Loading required package: patchwork
##  Loading package: gridExtra
## Loading required package: gridExtra
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
##  Loading package: GGally
## Loading required package: GGally
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
##  Loading package: readxl
## Loading required package: readxl
##  Loading package: viridis
## Loading required package: viridis
## Loading required package: viridisLite
##  Loading package: cowplot
## Loading required package: cowplot
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:patchwork':
## 
##     align_plots
##  Loading package: ggsignif
## Loading required package: ggsignif
##  Loading package: minpack.lm
## Loading required package: minpack.lm
##  Loading package: purrr
## Loading required package: purrr
##  Loading package: scales
## Loading required package: scales
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:viridis':
## 
##     viridis_pal
## The following object is masked from 'package:readr':
## 
##     col_factor
##  Loading package: bigsnpr
## Loading required package: bigsnpr
## Loading required package: bigstatsr
##  Loading package: drc
## Loading required package: drc
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:patchwork':
## 
##     area
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## 'drc' has been loaded.
## Please cite R and 'drc' if used for a publication,
## for references type 'citation()' and 'citation('drc')'.
## 
## Attaching package: 'drc'
## The following objects are masked from 'package:stats':
## 
##     gaussian, getInitial
##  Loading package: data.table
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
## 
##     transpose
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## Autoloading helper functions
##  Running helper script: globals.R
##  Running helper script: helpers.R
## Autoloading data
## Munging data
##  Running preprocessing script: 01-util.R
##  Sourcing R script: 01-util.R
##  Running preprocessing script: 02-munge_kras.R
##  Sourcing R script: 02-munge_kras.R
## Rows: 189 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): SEQ, ATOM, COLOR, CONFIDENCE INTERVAL, MSA DATA, RESIDUE VARIETY
## dbl (2): POS, SCORE
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 3591 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): X1, X2, X4
## dbl (1): X3
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 3780 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): variant
## dbl (2): score, pos
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 3572 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): mutant
## dbl (11): column_coverage, popEVE, pop-adjusted_ESM1v, pop-adjusted_EVH_epis...
## lgl  (3): pop-adjusted_EVE, pop-adjusted_Tranception, EVE
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 3780 Columns: 43
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): wt_aa, mt_aa, ClinVar_ClinicalSignificance, Starred_Coarse_Grained...
## dbl (11): position, Gold_Stars, NumberSubmitters, frequency_gv2, frequency_g...
## lgl (14): BS1, PM2, PM5, PP5, BP6, b_model, p_model, b_acmg_model, lb_acmg_m...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 2587 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): id, wt_aa, mt_aa, bp_interface, binding_RAF, binding_RAL, binding_...
## dbl (20): Pos_real, abundance_ddg, abundance_ddg_std, pik3cg_ddg, pik3cg_ddg...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 3453 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): id, wt_codon
## dbl (4): Pos_real, mean_kcal/mol, std_kcal/mol, ESM1v
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
##  Running preprocessing script: 03-munge_src.R
## 
##  Sourcing R script: 03-munge_src.R
## 
## Rows: 5112 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: " "
## chr (1): id
## dbl (8): FL_activity_mean_kcal/mol, FL_activity_std_kcal/mol, FL_folding_mea...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 536 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): SEQ, ATOM, COLOR, CONFIDENCE INTERVAL, MSA DATA, RESIDUE VARIETY
## dbl (2): POS, SCORE
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 10184 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): X1, X2, X4
## dbl (1): X3
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 10720 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): variant
## dbl (2): score, pos
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 10184 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): mutant
## dbl (14): column_coverage, popEVE, pop-adjusted_EVE, pop-adjusted_ESM1v, pop...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 10184 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): mutant
## dbl (14): column_coverage, popEVE, pop-adjusted_EVE, pop-adjusted_ESM1v, pop...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 536 Columns: 27
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): WT_AA, domains_Pfam, KD_lobe, secondary_structure_uniprot, seconda...
## dbl  (1): position
## lgl (11): block1, block2, block3, block4, block5, R-spine, C-spine, Communit...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4898 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): id, wt_aa
## dbl (9): FL_kinase_fitness_scaled, FL_kinase_sigma_scaled, FL_abundance_fitn...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
##  Running preprocessing script: 04-munge_psd95.R
## 
##  Sourcing R script: 04-munge_psd95.R
## 
## Rows: 3154 Columns: 41
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): id_eve, id_old, trait_name, library, assay, pdz_name, alignment_po...
## dbl (26): X, V1, pos_am, ddg, std_ddg, ci95_kcal.mol, pdz, structural_alignm...
## lgl  (1): binding_interface_5A
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
##  Running preprocessing script: 05-munge-grb2.R
## 
##  Sourcing R script: 05-munge-grb2.R
## 
## Rows: 1056 Columns: 43
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): id, old_id, id_ref, SS, Pos_class, protein, WT_AA, Mut, wt_aa.x, m...
## dbl (23): Pos_real, Pos_ref, Pos, mut_order, f_dg_pred, f_ddg_pred, f_ddg_pr...
## lgl  (5): f_ddg_pred_conf, b_ddg_pred_conf, allosteric, orthosteric, alloste...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1689 Columns: 27
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): id, protein, pca_type, aa_seq, old_id, wt_aa.x, mt_aa, wt_aa.y, at...
## dbl (14): Pos_real, Nham_aa, fitness, sigma, growthrate, growthrate_sigma, c...
## lgl  (1): WT
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pten_abundance <- read.csv("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_ddg/proteinGym/PTEN_HUMAN_Matreyek_2021.csv", header = TRUE)
pten_activity <- read.csv("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_ddg/proteinGym/PTEN_HUMAN_Mighell_2018.csv", header = TRUE)

pten_esm <- read.csv("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_ESM1v/P60484.csv")
nrow(pten_abundance) #5083
## [1] 5083
nrow(pten_activity) #7260
## [1] 7260
nrow(pten_esm) #7657
## [1] 7657
pten_df <- merge(pten_abundance, pten_esm, by.x = "mutant", by.y = "variant") 
pten_df <- pten_df %>% dplyr::select (mutant, DMS_score, DMS_score_bin, ESM.1v) %>% 
  dplyr::rename(DMS_score_abundance = DMS_score,
                DMS_score_bin_abundance = DMS_score_bin)
pten_df <- merge(pten_df, pten_activity, by = "mutant") 
pten_df <- pten_df %>% dplyr::rename(DMS_score_activity = DMS_score,
                                     DMS_score_bin_activity = DMS_score_bin) %>% 
  dplyr::select (mutant, DMS_score_abundance, DMS_score_bin_abundance, ESM.1v,
                 DMS_score_activity, DMS_score_bin_activity)


pten_df <- pten_df %>%
  mutate(mutation_position = as.numeric(str_extract(mutant, "(?<=\\D)(\\d+)(?=\\D)")))
nrow(pten_df) #4839
## [1] 4839
pten_clinvar <- read.csv("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_clinvar/P60484_clinvar_PTEN.csv")
nrow(pten_clinvar) #7657
## [1] 7657
head(pten_clinvar)
##   variant   V1      Model               Dataset  ddG_pred position wildtype
## 1   A120C 2381 ThermoMPNN AF-P60484-F1-model_v2 0.7477777      119        A
## 2   A120D 2382 ThermoMPNN AF-P60484-F1-model_v2 2.9353685      119        A
## 3   A120E 2383 ThermoMPNN AF-P60484-F1-model_v2 2.7981739      119        A
## 4   A120F 2384 ThermoMPNN AF-P60484-F1-model_v2 1.7522278      119        A
## 5   A120G 2385 ThermoMPNN AF-P60484-F1-model_v2 1.6180537      119        A
## 6   A120H 2386 ThermoMPNN AF-P60484-F1-model_v2 2.7476149      119        A
##   mutation                   pdb chain new_position uniprot exposure_SS
## 1        C AF-P60484-F1-model_v2     A          120  P60484           E
## 2        D AF-P60484-F1-model_v2     A          120  P60484           E
## 3        E AF-P60484-F1-model_v2     A          120  P60484           E
## 4        F AF-P60484-F1-model_v2     A          120  P60484           E
## 5        G AF-P60484-F1-model_v2     A          120  P60484           E
## 6        H AF-P60484-F1-model_v2     A          120  P60484           E
##   exposure_rASA spot_disorder func_esms_residue_class func_esms_variant_class
## 1             0             0                       2                       2
## 2             0             0                       2                       2
## 3             0             0                       2                       2
## 4             0             0                       2                       2
## 5             0             0                       2                       2
## 6             0             0                       2                       2
##   clinvar_clinical_significance
## 1                              
## 2                              
## 3                      conflict
## 4                              
## 5                           VUS
## 6                              
##                                                                                                                                                                                                                                                                                                                                                                                                              sequence
## 1 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 2 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 3 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 4 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 5 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 6 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
pten_df <- merge(pten_df, pten_clinvar, by.x="mutant", by.y = "variant", all.x = TRUE)
nrow(pten_df) #4839
## [1] 4839
pten_var <- read_excel("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_clinvar/taylor_supplement.xlsx", sheet = "Table S3")
head(pten_var)
## # A tibble: 6 × 34
##   Subject_ID Nucleotide_Change Amino_Acid_Change AA_one_letter Age_Assessment
##        <dbl> <chr>             <chr>             <chr>                  <dbl>
## 1        330 c.716T>C          p.(Met239Thr)     M239T                     26
## 2        370 c.395G>A          p.(Gly132Asp)     G132D                     63
## 3        435 c.406T>C          p.(Cys136Arg)     C136R                     52
## 4        522 c.103A>G          p.(Met35Val)      M35V                      35
## 5        897 c.284C>T          p.(Pro95Leu)      P95L                      17
## 6       1005 c.144C>A          p.(Asn48Lys)      N48K                      39
## # ℹ 29 more variables: Sex <chr>, CC_Score <chr>, Fitness_Score <dbl>,
## #   Abundance_Score <dbl>, Phenotype_List <chr>, Phenotype_Class <chr>,
## #   ASD <lgl>, PHTS <lgl>, Age_Head_Measure <dbl>, `Head_Measure(cm)` <dbl>,
## #   Head_size_expected <dbl>, Head_size_expected_SD <dbl>, Head_z_score <dbl>,
## #   CADD_Score <dbl>, Fitness_score_coded <dbl>, Abundance_score_coded <dbl>,
## #   Quadrant <dbl>, Thyroid_Age_Dx <chr>, Breast_1_Age_Dx <chr>,
## #   Breast_2_Age_Dx <chr>, Melanoma_Age_Dx <chr>, Colorec_Age_Dx <chr>, …
nrow(pten_var) #145
## [1] 145
pten_var <- pten_var %>% dplyr::select(AA_one_letter, Phenotype_Class, Phenotype_List, Fitness_Score, Abundance_Score)
pten_df <- merge(pten_df, pten_var, by.x="mutant", by.y = "AA_one_letter", all.x = TRUE)
nrow(pten_df) #4873
## [1] 4873
pten_gnomad <- read.csv("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_gnomad/PTEN_gnomAD_v4.1.0_ENST00000371953_2025_04_08_14_29_46.csv")
pten_gnomad <- pten_gnomad %>% filter(VEP.Annotation == "missense_variant")
summary(pten_gnomad$Allele.Frequency)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 6.195e-07 6.197e-07 6.211e-07 2.946e-06 1.859e-06 1.289e-04
#      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
# 6.195e-07 6.197e-07 6.211e-07 2.946e-06 1.859e-06 1.289e-04 
pten_gnomad <-  pten_gnomad %>% dplyr::select(HGVS.Consequence, ClinVar.Germline.Classification,
                                              rsIDs, ClinVar.Variation.ID)

# Apply to the column
aa_map <- c(
  Ala="A", Arg="R", Asn="N", Asp="D", Cys="C",
  Gln="Q", Glu="E", Gly="G", His="H", Ile="I",
  Leu="L", Lys="K", Met="M", Phe="F", Pro="P",
  Ser="S", Thr="T", Trp="W", Tyr="Y", Val="V",
  Ter="*"
)
convert_to_short <- function(consequence) {
  consequence <- gsub("^p\\.", "", consequence)  # Remove 'p.'
  matches <- regmatches(consequence, regexec("([A-Za-z]{3})([0-9]+)([A-Za-z]{3})", consequence))[[1]]
  if (length(matches) == 4) {
    from <- aa_map[[matches[2]]]
    pos <- matches[3]
    to <- aa_map[[matches[4]]]
    return(paste0(from, pos, to))
  } else {
    return(NA)
  }
}
pten_gnomad$ID <- sapply(pten_gnomad$HGVS.Consequence, convert_to_short)

head(pten_gnomad)
##   HGVS.Consequence              ClinVar.Germline.Classification        rsIDs
## 1        p.Ile5Met                                              rs2132145458
## 2        p.Val9Asp                                              rs2132145566
## 3       p.Ser10Gly                       Uncertain significance  rs572685299
## 4       p.Tyr16His                       Uncertain significance rs1064796078
## 5       p.Asp19Gly                       Uncertain significance rs1554890392
## 6       p.Asp22Gly Conflicting classifications of pathogenicity rs1554890398
##   ClinVar.Variation.ID   ID
## 1                   NA  I5M
## 2                   NA  V9D
## 3               428272 S10G
## 4               428195 Y16H
## 5               492333 D19G
## 6               486221 D22G
nrow(pten_gnomad) #235
## [1] 235
length(unique(pten_gnomad$ID))
## [1] 235
pten_gnomad$gnomad <- "gnomad"

pten_df <- merge(pten_df, pten_gnomad, by.x="mutant", by.y = "ID", all.x = TRUE)
nrow(pten_df) #4873
## [1] 4873
head(pten_df)
##   mutant DMS_score_abundance DMS_score_bin_abundance    ESM.1v
## 1  A120E        -0.075192482                       0 -16.59294
## 2  A120F         0.449809072                       0 -16.11391
## 3  A120G         0.639477479                       0 -10.35606
## 4  A120L         0.023241040                       0 -14.00481
## 5  A120P        -0.004816027                       0 -14.67169
## 6  A120Q         0.108151133                       0 -19.15990
##   DMS_score_activity DMS_score_bin_activity mutation_position   V1      Model
## 1         -2.0938569                      0               120 2383 ThermoMPNN
## 2         -3.4270689                      0               120 2384 ThermoMPNN
## 3         -0.0142132                      1               120 2385 ThermoMPNN
## 4         -3.4214305                      0               120 2389 ThermoMPNN
## 5         -3.4173247                      0               120 2392 ThermoMPNN
## 6         -4.5183149                      0               120 2393 ThermoMPNN
##                 Dataset ddG_pred position wildtype mutation
## 1 AF-P60484-F1-model_v2 2.798174      119        A        E
## 2 AF-P60484-F1-model_v2 1.752228      119        A        F
## 3 AF-P60484-F1-model_v2 1.618054      119        A        G
## 4 AF-P60484-F1-model_v2 1.529649      119        A        L
## 5 AF-P60484-F1-model_v2 2.719771      119        A        P
## 6 AF-P60484-F1-model_v2 2.936961      119        A        Q
##                     pdb chain new_position uniprot exposure_SS exposure_rASA
## 1 AF-P60484-F1-model_v2     A          120  P60484           E             0
## 2 AF-P60484-F1-model_v2     A          120  P60484           E             0
## 3 AF-P60484-F1-model_v2     A          120  P60484           E             0
## 4 AF-P60484-F1-model_v2     A          120  P60484           E             0
## 5 AF-P60484-F1-model_v2     A          120  P60484           E             0
## 6 AF-P60484-F1-model_v2     A          120  P60484           E             0
##   spot_disorder func_esms_residue_class func_esms_variant_class
## 1             0                       2                       2
## 2             0                       2                       2
## 3             0                       2                       2
## 4             0                       2                       2
## 5             0                       2                       2
## 6             0                       2                       2
##   clinvar_clinical_significance
## 1                      conflict
## 2                              
## 3                           VUS
## 4                              
## 5                              
## 6                              
##                                                                                                                                                                                                                                                                                                                                                                                                              sequence
## 1 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 2 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 3 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 4 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 5 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 6 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
##   Phenotype_Class
## 1            PHTS
## 2            <NA>
## 3            <NA>
## 4            <NA>
## 5            <NA>
## 6            <NA>
##                                                                                                                                                                                                                 Phenotype_List
## 1 Adenomatous Polyps|Anemia|Benign Neurologic Neoplasms|Gastroesophageal Reflux Disease|Hamartomatous Polyp|Hemangioma of Skin|Inflammatory Polyp|Lipoma|Macrocephaly|Malignant Neoplasm of Prostate|Polyp Hyperplastic Benign
## 2                                                                                                                                                                                                                         <NA>
## 3                                                                                                                                                                                                                         <NA>
## 4                                                                                                                                                                                                                         <NA>
## 5                                                                                                                                                                                                                         <NA>
## 6                                                                                                                                                                                                                         <NA>
##   Fitness_Score Abundance_Score HGVS.Consequence
## 1     -2.093857     -0.07519248             <NA>
## 2            NA              NA             <NA>
## 3            NA              NA             <NA>
## 4            NA              NA             <NA>
## 5            NA              NA             <NA>
## 6            NA              NA             <NA>
##   ClinVar.Germline.Classification rsIDs ClinVar.Variation.ID gnomad
## 1                            <NA>  <NA>                   NA   <NA>
## 2                            <NA>  <NA>                   NA   <NA>
## 3                            <NA>  <NA>                   NA   <NA>
## 4                            <NA>  <NA>                   NA   <NA>
## 5                            <NA>  <NA>                   NA   <NA>
## 6                            <NA>  <NA>                   NA   <NA>
all_positions <- c(
  88:98,    # WPD loop
  160:171,  # TI loop
  123:130,  # P loop
  35:49,    # Arginine loop
  200:212,  # CBR1
  226:238,  # CBR2
  258:268,  # CBR3
  327:335,   # Cα2 loop
  151:174, # membrane-binding alpha-helix
  1:15 # PBM motif 
)

length(pten_df[is.na(pten_df$DMS_score_abundance),] %>% pull(mutant)) #
## [1] 0
pten_df <- pten_df[!is.na(pten_df$DMS_score_abundance), ]
nrow(pten_df) #4873
## [1] 4873
fil_pten_df <- pten_df %>%
  filter(!mutation_position %in% all_positions) %>% distinct(mutant, .keep_all = TRUE)

nrow(fil_pten_df) #3454
## [1] 3454
# Fit a loess model using the filtered data
loess_fit <- loess(DMS_score_activity ~ DMS_score_abundance, data = fil_pten_df, span = 0.7, family = "symmetric")

# Predict fitted values for ALL data points using the loess model trained on fil_pten_df
pten_df$fitted <- predict(loess_fit, newdata = pten_df)

# Calculate residuals for ALL points
pten_df$residuals <-   pten_df$DMS_score_activity - pten_df$fitted
range(pten_df$residuals) #-5.157748  4.754293
## [1] -5.157748  4.754293
fit_line_df <- data.frame(
  DMS_score_abundance = seq(min(fil_pten_df$DMS_score_abundance, na.rm = TRUE),
                            max(fil_pten_df$DMS_score_abundance, na.rm = TRUE),
                            length.out = 200)
)
fit_line_df$DMS_score_activity <- predict(loess_fit, newdata = fit_line_df)

range(pten_df$residuals)
## [1] -5.157748  4.754293
range(pten_df$DMS_score_abundance)
## [1] -0.2776983  1.6652985
range(pten_df$DMS_score_activity)
## [1] -5.754822  2.817851
pten_df <- pten_df %>%
  mutate(mutation_position = as.numeric(str_extract(mutant, "(?<=\\D)(\\d+)(?=\\D)")))

nrow(pten_df)
## [1] 4873
median_residuals <- pten_df %>%
  group_by(mutation_position) %>%
  summarise(median_residuals = median(residuals, na.rm = TRUE))

min(median_residuals$median_residuals) #-4.034455
## [1] -4.034455
max(median_residuals$median_residuals) #1.861488
## [1] 1.861488
pdb <- read.pdb("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/residual_pdb/PTEN/1d5r.pdb")
data <- median_residuals
head(data)
## # A tibble: 6 × 2
##   mutation_position median_residuals
##               <dbl>            <dbl>
## 1                 2            0.728
## 2                 3            1.34 
## 3                 5           -0.224
## 4                 6           -1.19 
## 5                 7            0.579
## 6                 9           -0.625
# Create a new B-factor vector initialized with the current B-factors from the PDB
new_b_factors <- pdb$atom$b

# Loop through each position in the correlation data and update the B-factors
for (i in 1:nrow(data)) {
  position <- data$mutation_position[i]
  correlation_value <- data$median_residuals[i]
  
  # Find indices in the PDB that match the current position
  indices <- which(pdb$atom$resno == position)
  
  # Print the indices and current B-factors before updating
  #cat("Updating residue number:", position, "\n")
  #cat("Indices in PDB:", indices, "\n")
  #cat("Current B-factors:", new_b_factors[indices], "\n")
  
  # Update B-factors for all atoms in the current residue
  new_b_factors[indices] <- correlation_value
  
  # Print the new B-factors after updating
  #cat("Updated B-factors:", new_b_factors[indices], "\n")
  #cat("\n")  # Add an extra line for readability
}
# Replace non-matching B-factors with outlier value so we can filter it out in ChimeraX
non_matching_indices <- setdiff(seq_along(new_b_factors), which(pdb$atom$resno %in% data$mutation_position))
new_b_factors[non_matching_indices] <- 999

# Assign the new B-factors back to the pdb structure
# Write the modified PDB structure to a new file
pdb$atom$b <- new_b_factors
write.pdb(pdb, file = "/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/residual_pdb/PTEN/1d5r_median_residual_loess.pdb")
pten_soma <- read.delim("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_somatic/All_cBioportal_nonredundant_210417.tsv", header = TRUE, sep = "\t")
pten_soma <- pten_soma %>%
  filter(!grepl("\\*", Protein.Change))
pten_soma$somatic <- "Somatic"
head(pten_soma)
##                                                     Study         Sample.ID
## 1                  Acral Melanoma (TGEN, Genome Res 2017)       148336T-25b
## 2       Non-Small Cell Cancer (MSKCC, Cancer Discov 2017) P-0003863-T01-IM5
## 3       Non-Small Cell Cancer (MSKCC, Cancer Discov 2017) P-0009431-T01-IM5
## 4 Uterine Sarcoma/Mesenchymal (MSK, Clin Cancer Res 2020) P-0001821-T01-IM3
## 5                  Pan-Lung Cancer (TCGA, Nat Genet 2016)   TCGA-39-5027-01
## 6                  Pan-Lung Cancer (TCGA, Nat Genet 2016)   TCGA-21-5783-01
##   Cancer.Type Protein.Change
## 1                      R130G
## 2                      R130Q
## 3                      R130G
## 4                      R130Q
## 5                      R130Q
## 6                      R130Q
##                                                                                                         Annotation
## 1                   OncoKB: Oncogenic, level_4;CIViC: NA;MyCancerGenome: present;CancerHotspot: yes;3DHotspot: yes
## 2 OncoKB: Likely Oncogenic, level_4;CIViC: Functional: 1;MyCancerGenome: present;CancerHotspot: yes;3DHotspot: yes
## 3                   OncoKB: Oncogenic, level_4;CIViC: NA;MyCancerGenome: present;CancerHotspot: yes;3DHotspot: yes
## 4 OncoKB: Likely Oncogenic, level_4;CIViC: Functional: 1;MyCancerGenome: present;CancerHotspot: yes;3DHotspot: yes
## 5 OncoKB: Likely Oncogenic, level_4;CIViC: Functional: 1;MyCancerGenome: present;CancerHotspot: yes;3DHotspot: yes
## 6 OncoKB: Likely Oncogenic, level_4;CIViC: Functional: 1;MyCancerGenome: present;CancerHotspot: yes;3DHotspot: yes
##                                                                                                                         Functional.Impact
## 1    MutationAssessor: impact: high, score: 4.165;SIFT: impact: deleterious, score: 0;Polyphen-2: impact: probably_damaging, score: 0.999
## 2 MutationAssessor: impact: high, score: 3.615;SIFT: impact: deleterious, score: 0.02;Polyphen-2: impact: probably_damaging, score: 0.998
## 3    MutationAssessor: impact: high, score: 4.165;SIFT: impact: deleterious, score: 0;Polyphen-2: impact: probably_damaging, score: 0.999
## 4 MutationAssessor: impact: high, score: 3.615;SIFT: impact: deleterious, score: 0.02;Polyphen-2: impact: probably_damaging, score: 0.998
## 5 MutationAssessor: impact: high, score: 3.615;SIFT: impact: deleterious, score: 0.02;Polyphen-2: impact: probably_damaging, score: 0.998
## 6 MutationAssessor: impact: high, score: 3.615;SIFT: impact: deleterious, score: 0.02;Polyphen-2: impact: probably_damaging, score: 0.998
##       Mutation.Type Variant.Type     Copy.. COSMIC      MS      VS
## 1 Missense_Mutation          SNP    diploid    222    <NA>    <NA>
## 2 Missense_Mutation          SNP    diploid    222 SOMATIC    <NA>
## 3 Missense_Mutation          SNP    diploid    222 SOMATIC    <NA>
## 4 Missense_Mutation          SNP    diploid    222 SOMATIC Unknown
## 5 Missense_Mutation          SNP shallowdel    222 Somatic    <NA>
## 6 Missense_Mutation          SNP shallowdel    222 Somatic    <NA>
##          Center Chromosome Start.Pos  End.Pos Ref Var            HGVSg
## 1             .         10  89692904 89692904   C   G 10:g.89692904C>G
## 2         MSKCC         10  89692905 89692905   G   A 10:g.89692905G>A
## 3         MSKCC         10  89692904 89692904   C   G 10:g.89692904C>G
## 4         MSKCC         10  89692905 89692905   G   A 10:g.89692905G>A
## 5 broad.mit.edu         10  89692905 89692905   G   A 10:g.89692905G>A
## 6 broad.mit.edu         10  89692905 89692905   G   A 10:g.89692905G>A
##                        HGVSc Allele.Freq..T. Allele.Freq..N. Variant.Reads
## 1 ENST00000371953.3:c.388C>G              NA              NA            NA
## 2 ENST00000371953.3:c.389G>A              NA              NA            NA
## 3 ENST00000371953.3:c.388C>G              NA              NA            NA
## 4 ENST00000371953.3:c.389G>A            0.35              NA           174
## 5 ENST00000371953.3:c.389G>A            0.23              NA            17
## 6 ENST00000371953.3:c.389G>A            0.42              NA            39
##   Ref.Reads Variant.Reads..Normal. Ref.Reads..Normal. X..Mut.in.Sample Exon
## 1        NA                     NA                 NA              357  5/9
## 2        NA                     NA                 NA                8  5/9
## 3        NA                     NA                 NA                9  5/9
## 4       318                     NA                275               65  5/9
## 5        56                     NA                 81              295  5/9
## 6        53                     NA                186              363  5/9
##   gnomAD                                                     ClinVar
## 1     NA                               Likely pathogenic, Pathogenic
## 2     NA Pathogenic, Likely pathogenic, Pathogenic/Likely pathogenic
## 3     NA                               Likely pathogenic, Pathogenic
## 4     NA Pathogenic, Likely pathogenic, Pathogenic/Likely pathogenic
## 5     NA Pathogenic, Likely pathogenic, Pathogenic/Likely pathogenic
## 6     NA Pathogenic, Likely pathogenic, Pathogenic/Likely pathogenic
##         dbSNP somatic
## 1 rs121909224 Somatic
## 2 rs121909229 Somatic
## 3 rs121909224 Somatic
## 4 rs121909229 Somatic
## 5 rs121909229 Somatic
## 6 rs121909229 Somatic
nrow(pten_soma) #2610
## [1] 2610
table(pten_soma$Annotation)
## 
##                                                OncoKB: Inconclusive, level NA;CIViC: NA;MyCancerGenome: not present;CancerHotspot: no;3DHotspot: no 
##                                                                                                                                                   2 
##                                              OncoKB: Inconclusive, level NA;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes 
##                                                                                                                                                   6 
##                                              OncoKB: Likely Neutral, level NA;CIViC: NA;MyCancerGenome: not present;CancerHotspot: no;3DHotspot: no 
##                                                                                                                                                   2 
##                                    OncoKB: Likely Oncogenic, level_4;CIViC: Functional: 1;MyCancerGenome: present;CancerHotspot: yes;3DHotspot: yes 
##                                                                                                                                                 161 
##                                             OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: no;3DHotspot: no 
##                                                                                                                                                 309 
##                                            OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: no;3DHotspot: yes 
##                                                                                                                                                  83 
##                                            OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: no 
##                                                                                                                                                 358 
##                                           OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes 
##                                                                                                                                                 274 
##                                                 OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: present;CancerHotspot: no;3DHotspot: no 
##                                                                                                                                                   2 
##                                                OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: present;CancerHotspot: no;3DHotspot: yes 
##                                                                                                                                                  14 
##                                                OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: present;CancerHotspot: yes;3DHotspot: no 
##                                                                                                                                                  16 
##                                               OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: present;CancerHotspot: yes;3DHotspot: yes 
##                                                                                                                                                  11 
## OncoKB: Likely Oncogenic, level_4;CIViC: Oncogenic: 1, Diagnostic: 2, Predisposing: 2;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes 
##                                                                                                                                                  48 
##                                                 OncoKB: Oncogenic, level NA;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes 
##                                                                                                                                                   1 
##                                        OncoKB: Oncogenic, level_4;CIViC: Functional: 1;MyCancerGenome: not present;CancerHotspot: no;3DHotspot: yes 
##                                                                                                                                                  13 
##                                                    OncoKB: Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: no;3DHotspot: no 
##                                                                                                                                                  89 
##                                                   OncoKB: Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: no;3DHotspot: yes 
##                                                                                                                                                  53 
##                                                   OncoKB: Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: no 
##                                                                                                                                                  43 
##                                                  OncoKB: Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes 
##                                                                                                                                                 203 
##                                                      OncoKB: Oncogenic, level_4;CIViC: NA;MyCancerGenome: present;CancerHotspot: yes;3DHotspot: yes 
##                                                                                                                                                 103 
##                                        OncoKB: Predicted Oncogenic, level NA;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: no 
##                                                                                                                                                   1 
##                                         OncoKB: Predicted Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: no 
##                                                                                                                                                  36 
##                                        OncoKB: Predicted Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes 
##                                                                                                                                                  98 
##                                                     OncoKB: Unknown, level NA;CIViC: NA;MyCancerGenome: not present;CancerHotspot: no;3DHotspot: no 
##                                                                                                                                                 588 
##                                                    OncoKB: Unknown, level NA;CIViC: NA;MyCancerGenome: not present;CancerHotspot: no;3DHotspot: yes 
##                                                                                                                                                  92 
##                                                   OncoKB: Unknown, level NA;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes 
##                                                                                                                                                   2 
##                                                         OncoKB: Unknown, level NA;CIViC: NA;MyCancerGenome: present;CancerHotspot: no;3DHotspot: no 
##                                                                                                                                                   2
pten_soma_hotspot <- pten_soma[grepl("CancerHotspot: yes", pten_soma$Annotation), ]
pten_soma_hotspot_unique <- pten_soma_hotspot %>% distinct(Protein.Change, .keep_all = TRUE)
nrow(pten_soma_hotspot_unique) #144
## [1] 144
nrow(pten_df) #4873
## [1] 4873
pten_df_clean <- pten_df %>% distinct(mutant, .keep_all = TRUE)
nrow(pten_df_clean) #4839
## [1] 4839
length(intersect(pten_soma_hotspot$Protein.Change, pten_df_clean$mutant)) #93
## [1] 93
length(setdiff(pten_soma_hotspot$Protein.Change, pten_df_clean$mutant)) #51
## [1] 51
pten_df_clean$somatic <- ifelse(
  pten_df_clean$mutant %in% pten_soma_hotspot$Protein.Change,
  "Somatic",
  NA
)
table(pten_df_clean$somatic)
## 
## Somatic 
##      93
pten_df <- merge(pten_df, pten_soma_hotspot, by.x="mutant", by.y = "Protein.Change", all.x = TRUE)
nrow(pten_df) 
## [1] 6674
pten_df <- pten_df %>% distinct(mutant, .keep_all = TRUE)

table(pten_df$somatic)
## 
## Somatic 
##      93
pten_gnomad <- pten_df %>% filter(!is.na(HGVS.Consequence)) %>% 
  filter(!clinvar_clinical_significance %in% c("likely_pathogenic", "pathogenic", "conflict", "likely_risk_allele", "uncertain_risk_allele", "VUS")) %>%
  filter(!ClinVar.Germline.Classification %in% c("Pathogenic", "Pathogenic/Likely pathogenic", "Likely pathogenic",
                                                 "Conflicting classifications of pathogenicity", "Likely pathogenic/Likely risk allele",
                                                 "Likely risk allele", "Pathogenic/Likely pathogenic/Likely risk allele",
                                                 "Uncertain significance", "Uncertain significance/Uncertain risk allele")) %>%
  filter(is.na(Phenotype_Class)) %>%
  filter(is.na(somatic))

nrow(pten_gnomad) #37
## [1] 37
length(unique(pten_gnomad$mutant)) #37
## [1] 37
pten_gnomad$var_source <- "gnomAD"


pten_somatic <- merge(pten_df, pten_soma_hotspot, by.x="mutant", by.y = "Protein.Change")
nrow(pten_somatic) #
## [1] 788
length(unique(pten_somatic$mutant)) #93
## [1] 93
pten_somatic <- pten_somatic %>% distinct(mutant, .keep_all = TRUE)
head(pten_somatic)
##   mutant DMS_score_abundance DMS_score_bin_abundance    ESM.1v
## 1  A126G           1.1031274                       1 -10.59961
## 2  A126S           1.1589273                       1 -13.37786
## 3  A126T           0.9698197                       1 -14.54880
## 4  C105G           0.3516079                       0 -13.94637
## 5  C105R           0.1588046                       0 -13.99142
## 6  C105S           0.2445279                       0 -10.05558
##   DMS_score_activity DMS_score_bin_activity mutation_position   V1      Model
## 1        -0.82914446                      1               126 2505 ThermoMPNN
## 2         0.06133791                      1               126 2515 ThermoMPNN
## 3        -1.93558841                      0               126 2516 ThermoMPNN
## 4        -4.83784503                      0               105 2085 ThermoMPNN
## 5        -3.78634247                      0               105 2094 ThermoMPNN
## 6        -1.05959009                      1               105 2095 ThermoMPNN
##                 Dataset  ddG_pred position wildtype mutation
## 1 AF-P60484-F1-model_v2 0.6510435      125        A        G
## 2 AF-P60484-F1-model_v2 0.2785859      125        A        S
## 3 AF-P60484-F1-model_v2 0.4760755      125        A        T
## 4 AF-P60484-F1-model_v2 1.0596524      104        C        G
## 5 AF-P60484-F1-model_v2 1.5254858      104        C        R
## 6 AF-P60484-F1-model_v2 0.4113229      104        C        S
##                     pdb chain new_position uniprot exposure_SS exposure_rASA
## 1 AF-P60484-F1-model_v2     A          126  P60484           S          0.33
## 2 AF-P60484-F1-model_v2     A          126  P60484           S          0.33
## 3 AF-P60484-F1-model_v2     A          126  P60484           S          0.33
## 4 AF-P60484-F1-model_v2     A          105  P60484           H          0.00
## 5 AF-P60484-F1-model_v2     A          105  P60484           H          0.00
## 6 AF-P60484-F1-model_v2     A          105  P60484           H          0.00
##   spot_disorder func_esms_residue_class func_esms_variant_class
## 1             0                       1                       1
## 2             0                       1                       1
## 3             0                       1                       1
## 4             0                       2                       2
## 5             0                       2                       2
## 6             0                       2                       1
##   clinvar_clinical_significance
## 1                           VUS
## 2                           VUS
## 3             likely_pathogenic
## 4             likely_pathogenic
## 5                              
## 6                              
##                                                                                                                                                                                                                                                                                                                                                                                                              sequence
## 1 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 2 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 3 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 4 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 5 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 6 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
##   Phenotype_Class Phenotype_List Fitness_Score Abundance_Score HGVS.Consequence
## 1            <NA>           <NA>            NA              NA             <NA>
## 2            <NA>           <NA>            NA              NA             <NA>
## 3            <NA>           <NA>            NA              NA             <NA>
## 4            <NA>           <NA>            NA              NA      p.Cys105Gly
## 5            <NA>           <NA>            NA              NA             <NA>
## 6            <NA>           <NA>            NA              NA             <NA>
##   ClinVar.Germline.Classification        rsIDs ClinVar.Variation.ID gnomad
## 1                            <NA>         <NA>                   NA   <NA>
## 2                            <NA>         <NA>                   NA   <NA>
## 3                            <NA>         <NA>                   NA   <NA>
## 4               Likely pathogenic rs1303165645              1728081 gnomad
## 5                            <NA>         <NA>                   NA   <NA>
## 6                            <NA>         <NA>                   NA   <NA>
##       fitted  residuals
## 1 -0.1823781 -0.6467664
## 2 -0.1908700  0.2522079
## 3 -0.1785532 -1.7570352
## 4 -1.3357357 -3.5021093
## 5 -2.3737286 -1.4126139
## 6 -1.9276639  0.8680738
##                                                        Study.x
## 1            Breast Invasive Carcinoma (TCGA, PanCancer Atlas)
## 2                        Breast Cancer (MSK, Cancer Cell 2018)
## 3 Uterine Corpus Endometrial Carcinoma (TCGA, PanCancer Atlas)
## 4                                  Melanomas (TCGA, Cell 2015)
## 5      Breast Cancer (METABRIC, Nature 2012 & Nat Commun 2016)
## 6  MSK-IMPACT Clinical Sequencing Cohort (MSKCC, Nat Med 2017)
##         Sample.ID.x                  Cancer.Type.x
## 1   TCGA-LL-A5YP-01                               
## 2 P-0000422-T02-IM3                               
## 3   TCGA-EY-A1GU-01                               
## 4   TCGA-D3-A2J8-06                               
## 5           MB-6143                               
## 6 P-0002672-T01-IM3 Gastrointestinal Stromal Tumor
##                                                                                                Annotation.x
## 1        OncoKB: Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 2        OncoKB: Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 3 OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 4 OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 5 OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 6 OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
##                                                                                                                         Functional.Impact.x
## 1       MutationAssessor: impact: low, score: 1.885;SIFT: impact: deleterious, score: 0;Polyphen-2: impact: probably_damaging, score: 0.998
## 2     MutationAssessor: impact: medium, score: 3.39;SIFT: impact: deleterious, score: 0;Polyphen-2: impact: probably_damaging, score: 0.998
## 3      MutationAssessor: impact: medium, score: 2.9;SIFT: impact: deleterious, score: 0;Polyphen-2: impact: probably_damaging, score: 0.999
## 4 MutationAssessor: impact: medium, score: 3.465;SIFT: impact: deleterious, score: 0.01;Polyphen-2: impact: probably_damaging, score: 0.998
## 5       MutationAssessor: impact: high, score: 3.81;SIFT: impact: deleterious, score: 0;Polyphen-2: impact: probably_damaging, score: 0.999
## 6     MutationAssessor: impact: medium, score: 2.53;SIFT: impact: deleterious, score: 0;Polyphen-2: impact: probably_damaging, score: 0.998
##     Mutation.Type.x Variant.Type.x Copy...x COSMIC.x    MS.x    VS.x
## 1 Missense_Mutation            SNP                18       .       .
## 2 Missense_Mutation            SNP                18 UNKNOWN Unknown
## 3 Missense_Mutation            SNP                18       .       .
## 4 Missense_Mutation            SNP                21 Somatic    <NA>
## 5 Missense_Mutation            SNP                21    <NA>    <NA>
## 6 Missense_Mutation            SNP  diploid       21    <NA>    <NA>
##        Center.x Chromosome.x Start.Pos.x End.Pos.x Ref.x Var.x          HGVSg.x
## 1             .           10    89692893  89692893     C     G 10:g.89692893C>G
## 2         MSKCC           10    89692892  89692892     G     T 10:g.89692892G>T
## 3             .           10    89692892  89692892     G     A 10:g.89692892G>A
## 4 broad.mit.edu           10    89692829  89692829     T     G 10:g.89692829T>G
## 5      METABRIC           10    89692829  89692829     T     C 10:g.89692829T>C
## 6          <NA>           10    89692829  89692829     T     A 10:g.89692829T>A
##                      HGVSc.x Allele.Freq..T..x Allele.Freq..N..x
## 1 ENST00000371953.3:c.377C>G              0.50                NA
## 2 ENST00000371953.3:c.376G>T              0.48            0.0036
## 3 ENST00000371953.3:c.376G>A              0.46                NA
## 4 ENST00000371953.3:c.313T>G              0.16                NA
## 5 ENST00000371953.3:c.313T>C                NA                NA
## 6 ENST00000371953.3:c.313T>A              0.32                NA
##   Variant.Reads.x Ref.Reads.x Variant.Reads..Normal..x Ref.Reads..Normal..x
## 1              27          27                       NA                   82
## 2             105         115                        1                  275
## 3              43          51                       NA                   68
## 4              11          56                       NA                   NA
## 5              NA          NA                       NA                   NA
## 6              44          93                       NA                   NA
##   X..Mut.in.Sample.x Exon.x gnomAD.x                                 ClinVar.x
## 1                 64    5/9       NA                                          
## 2                  7    5/9       NA                    Uncertain significance
## 3               1179    5/9       NA Uncertain significance, Likely pathogenic
## 4                507    5/9       NA                                          
## 5                  6    5/9       NA                                          
## 6                 16    5/9       NA                                          
##        dbSNP.x somatic.x
## 1                Somatic
## 2 rs1554898129   Somatic
## 3 rs1554898129   Somatic
## 4                Somatic
## 5                Somatic
## 6                Somatic
##                                                        Study.y
## 1            Breast Invasive Carcinoma (TCGA, PanCancer Atlas)
## 2                        Breast Cancer (MSK, Cancer Cell 2018)
## 3 Uterine Corpus Endometrial Carcinoma (TCGA, PanCancer Atlas)
## 4                                  Melanomas (TCGA, Cell 2015)
## 5      Breast Cancer (METABRIC, Nature 2012 & Nat Commun 2016)
## 6  MSK-IMPACT Clinical Sequencing Cohort (MSKCC, Nat Med 2017)
##         Sample.ID.y                  Cancer.Type.y
## 1   TCGA-LL-A5YP-01                               
## 2 P-0000422-T02-IM3                               
## 3   TCGA-EY-A1GU-01                               
## 4   TCGA-D3-A2J8-06                               
## 5           MB-6143                               
## 6 P-0002672-T01-IM3 Gastrointestinal Stromal Tumor
##                                                                                                Annotation.y
## 1        OncoKB: Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 2        OncoKB: Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 3 OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 4 OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 5 OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
## 6 OncoKB: Likely Oncogenic, level_4;CIViC: NA;MyCancerGenome: not present;CancerHotspot: yes;3DHotspot: yes
##                                                                                                                         Functional.Impact.y
## 1       MutationAssessor: impact: low, score: 1.885;SIFT: impact: deleterious, score: 0;Polyphen-2: impact: probably_damaging, score: 0.998
## 2     MutationAssessor: impact: medium, score: 3.39;SIFT: impact: deleterious, score: 0;Polyphen-2: impact: probably_damaging, score: 0.998
## 3      MutationAssessor: impact: medium, score: 2.9;SIFT: impact: deleterious, score: 0;Polyphen-2: impact: probably_damaging, score: 0.999
## 4 MutationAssessor: impact: medium, score: 3.465;SIFT: impact: deleterious, score: 0.01;Polyphen-2: impact: probably_damaging, score: 0.998
## 5       MutationAssessor: impact: high, score: 3.81;SIFT: impact: deleterious, score: 0;Polyphen-2: impact: probably_damaging, score: 0.999
## 6     MutationAssessor: impact: medium, score: 2.53;SIFT: impact: deleterious, score: 0;Polyphen-2: impact: probably_damaging, score: 0.998
##     Mutation.Type.y Variant.Type.y Copy...y COSMIC.y    MS.y    VS.y
## 1 Missense_Mutation            SNP                18       .       .
## 2 Missense_Mutation            SNP                18 UNKNOWN Unknown
## 3 Missense_Mutation            SNP                18       .       .
## 4 Missense_Mutation            SNP                21 Somatic    <NA>
## 5 Missense_Mutation            SNP                21    <NA>    <NA>
## 6 Missense_Mutation            SNP  diploid       21    <NA>    <NA>
##        Center.y Chromosome.y Start.Pos.y End.Pos.y Ref.y Var.y          HGVSg.y
## 1             .           10    89692893  89692893     C     G 10:g.89692893C>G
## 2         MSKCC           10    89692892  89692892     G     T 10:g.89692892G>T
## 3             .           10    89692892  89692892     G     A 10:g.89692892G>A
## 4 broad.mit.edu           10    89692829  89692829     T     G 10:g.89692829T>G
## 5      METABRIC           10    89692829  89692829     T     C 10:g.89692829T>C
## 6          <NA>           10    89692829  89692829     T     A 10:g.89692829T>A
##                      HGVSc.y Allele.Freq..T..y Allele.Freq..N..y
## 1 ENST00000371953.3:c.377C>G              0.50                NA
## 2 ENST00000371953.3:c.376G>T              0.48            0.0036
## 3 ENST00000371953.3:c.376G>A              0.46                NA
## 4 ENST00000371953.3:c.313T>G              0.16                NA
## 5 ENST00000371953.3:c.313T>C                NA                NA
## 6 ENST00000371953.3:c.313T>A              0.32                NA
##   Variant.Reads.y Ref.Reads.y Variant.Reads..Normal..y Ref.Reads..Normal..y
## 1              27          27                       NA                   82
## 2             105         115                        1                  275
## 3              43          51                       NA                   68
## 4              11          56                       NA                   NA
## 5              NA          NA                       NA                   NA
## 6              44          93                       NA                   NA
##   X..Mut.in.Sample.y Exon.y gnomAD.y                                 ClinVar.y
## 1                 64    5/9       NA                                          
## 2                  7    5/9       NA                    Uncertain significance
## 3               1179    5/9       NA Uncertain significance, Likely pathogenic
## 4                507    5/9       NA                                          
## 5                  6    5/9       NA                                          
## 6                 16    5/9       NA                                          
##        dbSNP.y somatic.y
## 1                Somatic
## 2 rs1554898129   Somatic
## 3 rs1554898129   Somatic
## 4                Somatic
## 5                Somatic
## 6                Somatic
pten_somatic <- pten_somatic %>% 
  filter(!clinvar_clinical_significance %in% c("likely_pathogenic", "pathogenic", "conflict", "likely_risk_allele", "uncertain_risk_allele")) %>%
  filter(!ClinVar.Germline.Classification %in% c("Pathogenic", "Pathogenic/Likely pathogenic", "Likely pathogenic",
                                                 "Conflicting classifications of pathogenicity", "Likely pathogenic/Likely risk allele",
                                                 "Likely risk allele", "Pathogenic/Likely pathogenic/Likely risk allele",
                                                 "Uncertain significance", "Uncertain significance/Uncertain risk allele")) %>%
  filter(is.na(Phenotype_Class))
nrow(pten_somatic) #37
## [1] 37
pten_somatic$var_source <- "Somatic"

colnames(pten_somatic)
##  [1] "mutant"                          "DMS_score_abundance"            
##  [3] "DMS_score_bin_abundance"         "ESM.1v"                         
##  [5] "DMS_score_activity"              "DMS_score_bin_activity"         
##  [7] "mutation_position"               "V1"                             
##  [9] "Model"                           "Dataset"                        
## [11] "ddG_pred"                        "position"                       
## [13] "wildtype"                        "mutation"                       
## [15] "pdb"                             "chain"                          
## [17] "new_position"                    "uniprot"                        
## [19] "exposure_SS"                     "exposure_rASA"                  
## [21] "spot_disorder"                   "func_esms_residue_class"        
## [23] "func_esms_variant_class"         "clinvar_clinical_significance"  
## [25] "sequence"                        "Phenotype_Class"                
## [27] "Phenotype_List"                  "Fitness_Score"                  
## [29] "Abundance_Score"                 "HGVS.Consequence"               
## [31] "ClinVar.Germline.Classification" "rsIDs"                          
## [33] "ClinVar.Variation.ID"            "gnomad"                         
## [35] "fitted"                          "residuals"                      
## [37] "Study.x"                         "Sample.ID.x"                    
## [39] "Cancer.Type.x"                   "Annotation.x"                   
## [41] "Functional.Impact.x"             "Mutation.Type.x"                
## [43] "Variant.Type.x"                  "Copy...x"                       
## [45] "COSMIC.x"                        "MS.x"                           
## [47] "VS.x"                            "Center.x"                       
## [49] "Chromosome.x"                    "Start.Pos.x"                    
## [51] "End.Pos.x"                       "Ref.x"                          
## [53] "Var.x"                           "HGVSg.x"                        
## [55] "HGVSc.x"                         "Allele.Freq..T..x"              
## [57] "Allele.Freq..N..x"               "Variant.Reads.x"                
## [59] "Ref.Reads.x"                     "Variant.Reads..Normal..x"       
## [61] "Ref.Reads..Normal..x"            "X..Mut.in.Sample.x"             
## [63] "Exon.x"                          "gnomAD.x"                       
## [65] "ClinVar.x"                       "dbSNP.x"                        
## [67] "somatic.x"                       "Study.y"                        
## [69] "Sample.ID.y"                     "Cancer.Type.y"                  
## [71] "Annotation.y"                    "Functional.Impact.y"            
## [73] "Mutation.Type.y"                 "Variant.Type.y"                 
## [75] "Copy...y"                        "COSMIC.y"                       
## [77] "MS.y"                            "VS.y"                           
## [79] "Center.y"                        "Chromosome.y"                   
## [81] "Start.Pos.y"                     "End.Pos.y"                      
## [83] "Ref.y"                           "Var.y"                          
## [85] "HGVSg.y"                         "HGVSc.y"                        
## [87] "Allele.Freq..T..y"               "Allele.Freq..N..y"              
## [89] "Variant.Reads.y"                 "Ref.Reads.y"                    
## [91] "Variant.Reads..Normal..y"        "Ref.Reads..Normal..y"           
## [93] "X..Mut.in.Sample.y"              "Exon.y"                         
## [95] "gnomAD.y"                        "ClinVar.y"                      
## [97] "dbSNP.y"                         "somatic.y"                      
## [99] "var_source"
pten_patho <- pten_df %>% filter(clinvar_clinical_significance %in% c("likely_pathogenic", "pathogenic"))
nrow(pten_patho) #
## [1] 115
length(unique(pten_patho$mutant)) #115 
## [1] 115
pten_patho <- pten_patho %>% distinct(mutant, .keep_all = TRUE)
nrow(pten_patho) #115
## [1] 115
pten_patho$var_source <- "Pathogenic"
p1 <- ggplot(pten_df, aes(x = DMS_score_abundance, y = DMS_score_activity, color = residuals)) +
  geom_point(size = 2, alpha = 0.35) +
  geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
  geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
  geom_line(data = fit_line_df, aes(x = DMS_score_abundance, y = DMS_score_activity),
            inherit.aes = FALSE, color = "black", linewidth = 1) +
    geom_text_repel(data = subset(pten_df, mutant %in% c("R130G", "C124R")),
                  aes(label = mutant),
                  size = 4,color = "black",
                  max.overlaps = Inf, box.padding = 0.4, point.padding = 0.3
  ) +
  geom_point(data = subset(pten_df, mutant %in% c("R130G", "C124R")),
             aes(x = DMS_score_abundance, y = DMS_score_activity),
             shape = 17, color = "black", size = 3) +  # Triangle shape
  labs(
    title = "All 4839 Variants",
    x = "Measured abundance",
    y = "Measured activity",
    color = "Residuals"
  ) +
  theme_classic() +
  ylim(-5.8, 2.9) + xlim(-0.3, 1.7) +
  scale_color_viridis(option = "C", direction = 1, limits = c(-5.2, 5)) +
  theme(legend.position = "none")

# Add marginal density plots
p1 <- ggMarginal(
  p1,
  type = "density",
  margins = "both",
  groupColour = FALSE,
  groupFill = FALSE,
  size = 10,
  colour = "grey",
  fill = "lightgrey"
)

nrow(pten_gnomad)
## [1] 37
p2 <- ggplot(pten_gnomad, aes(x = DMS_score_abundance, y = DMS_score_activity, color = residuals)) +
  geom_point(size = 2) +
  geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
  geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
  geom_line(data = fit_line_df, aes(x = DMS_score_abundance, y = DMS_score_activity),
            inherit.aes = FALSE, color = "black", linewidth = 1) +
  labs(
    title = "gnomAD Variants",
    x = "Abundance",
    y = "Activity",
    color = "Loess residuals"
  ) +
  theme_classic() +
  ylim(-5.8, 2.9) + xlim(-0.3, 1.7) +
  scale_color_viridis(option = "C", direction = 1, limits = c(-5.2, 5)) +
  theme(legend.position = "none") +
    geom_text_repel(data = subset(pten_df, mutant %in% c("A79T")),
                  aes(label = mutant),
                  size = 4,color = "black",
                  max.overlaps = Inf, box.padding = 0.4, point.padding = 0.3
  ) +
  geom_point(data = subset(pten_df, mutant %in% c("A79T")),
             aes(x = DMS_score_abundance, y = DMS_score_activity),
             color = "black", size = 2) 

# Add marginal density plots
p2 <- ggMarginal(
  p2,
  type = "density",
  margins = "both",
  groupColour = FALSE,
  groupFill = FALSE,
  size = 10,
  colour = "grey",
  fill = "lightgrey"
)


nrow(pten_patho)
## [1] 115
p3 <- ggplot(pten_patho, aes(x = DMS_score_abundance, y = DMS_score_activity, color = residuals)) +
  geom_point(size = 2, shape = 17) +
  geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
  geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
  geom_line(data = fit_line_df, aes(x = DMS_score_abundance, y = DMS_score_activity),
            inherit.aes = FALSE, color = "black", linewidth = 1) +
      geom_text_repel(data = subset(pten_df, mutant %in% c("R130G", "C124R")),
                  aes(label = mutant),
                  size = 4,color = "black",
                  max.overlaps = Inf, box.padding = 0.4, point.padding = 0.3
  ) +
  geom_point(data = subset(pten_df, mutant %in% c("R130G", "C124R")),
             aes(x = DMS_score_abundance, y = DMS_score_activity),
             shape = 17, color = "black", size = 3) +  # Triangle shape
  labs(
    title = "ClinVar Pathogenic Variants",
    x = "Abundance",
    y = "Activity",
    color = "Loess residuals"
  ) +
  theme_classic() +
  ylim(-5.8, 2.9) + xlim(-0.3, 1.7) +
  scale_color_viridis(option = "C", direction = 1, limits = c(-5.2, 5)) +
  theme(legend.position = "none")

# Add marginal density plots
p3 <- ggMarginal(
  p3,
  type = "density",
  margins = "both",
  groupColour = FALSE,
  groupFill = FALSE,
  size = 10,
  colour = "grey",
  fill = "lightgrey"
)

p4 <- plot_grid(p1,p2,p3,ncol=3, nrow=1)

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig4_scatter1.pdf", 
       plot = p4, width = 9, height = 3, dpi = 300)
p4

ggplot(pten_df, aes(x = DMS_score_abundance, y = DMS_score_activity, color = residuals)) +
  geom_point(size = 2, alpha = 0.35) +
  geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
  geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
  geom_line(data = fit_line_df, aes(x = DMS_score_abundance, y = DMS_score_activity),
            inherit.aes = FALSE, color = "black", linewidth = 1) +
    geom_text_repel(data = subset(pten_df, mutant %in% c("R130G", "C124R")),
                  aes(label = mutant),
                  size = 4,color = "black",
                  max.overlaps = Inf, box.padding = 0.4, point.padding = 0.3
  ) +
  geom_point(data = subset(pten_df, mutant %in% c("R130G", "C124R")),
             aes(x = DMS_score_abundance, y = DMS_score_activity),
             shape = 17, color = "black", size = 3) +  # Triangle shape
  labs(
    title = "All 4839 Variants",
    x = "Measured abundance",
    y = "Measured activity",
    color = "Residuals"
  ) +
  theme_classic() +
  ylim(-5.8, 2.9) + xlim(-0.3, 1.7) +
  scale_color_viridis(option = "C", direction = 1, limits = c(-4.8, 5.2)) +
  theme(legend.position = "bottom")

# 1. Set up
set.seed(123)
n_boot <- 1000
match_window <- 0.5
n_patho <- nrow(pten_patho)

# 2. Get abundance/residuals from pten_patho
patho_df <- pten_patho %>% dplyr::select(DMS_score_abundance, residuals)
patho_df$group <- "Pathogenic"

# 3. Bootstrap sampling from non-patho pool with abundance matching
non_patho_pool <- pten_df %>%
  filter(!(mutant %in% pten_patho$mutant)) %>%
  filter(!(mutant %in% pten_somatic$mutant)) %>% dplyr::select(DMS_score_abundance, residuals)

bootstrap_medians <- vector("numeric", length = n_boot)

# Pre-group non-patho pool into bins
non_patho_pool <- non_patho_pool %>%
  mutate(bin = cut(DMS_score_abundance, breaks = seq(-0.5, 2, by = match_window)))

# Bin pathogenic variants accordingly
patho_df <- patho_df %>%
  mutate(bin = cut(DMS_score_abundance, breaks = seq(-0.5, 2, by = match_window)))

# Create lookup table for fast sampling
bin_lookup <- split(non_patho_pool$residuals, non_patho_pool$bin)

# Bootstrap matrix
bootstrap_matrix <- matrix(NA, nrow = n_boot, ncol = n_patho)

for (i in 1:n_boot) {
  for (j in 1:n_patho) {
    bin_j <- patho_df$bin[j]
    candidates <- bin_lookup[[as.character(bin_j)]]
    if (!is.null(candidates) && length(candidates) > 0) {
      bootstrap_matrix[i, j] <- sample(candidates, 1)
    }
  }
}

# Summarize into a dataframe
boot_df <- data.frame(
  group = "Random abundance-matched",
  residuals = apply(bootstrap_matrix, 1, median, na.rm = TRUE)
)

# Combine with patho residuals
plot_df <- bind_rows(
  patho_df %>% dplyr::select(group, residuals),
  boot_df
)
label_df <- plot_df %>%
  group_by(group) %>%
  summarise(
    n = n(),
    median_val = median(residuals),
    y_max = max(residuals),
    .groups = "drop"
  )

label_df <- label_df %>%
  mutate(n_label = case_when(
    group == "Random abundance-matched" ~ "bootstrapped 1000 times",
    TRUE ~ paste0("n = ", n)
  ))

# Plot
p_fast <- ggplot(plot_df, aes(x = group, y = residuals, fill = group)) +
  geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
  geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
    stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
  stat_summary(fun = median, geom = "point", shape = 23, size = 2.5,
               fill = "black", color = "black", stroke = 0.7) +

 geom_text(
  data = label_df,
  aes(x = group, y = 4.5, label = n_label),
  inherit.aes = FALSE,
  size = 4) +

  geom_text(
    data = label_df,
    aes(x = group, y = median_val + 0.25, label = sprintf(" %.2f", median_val)),
    inherit.aes = FALSE,
    size = 6
  ) +
  labs(
    x = "",
    y = "Activity-abundance residuals",
    title = "All Variants"
  ) +
  theme_classic(base_size = 14) +
  scale_fill_manual(values = c("Pathogenic" = "cornflowerblue", "Random abundance-matched" = "darkolivegreen3")) +
  theme(legend.position = "none") +
  geom_signif(comparisons = list(c("Pathogenic", "Random abundance-matched")),
              map_signif_level = FALSE,
              test = "wilcox.test",
              tip_length = 0.01)

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig4_violin1.pdf", 
       plot = p_fast, width = 3, height = 4, dpi = 300)
p_fast

patho_df <- pten_df_clean %>% distinct(mutant, Phenotype_Class, .keep_all = TRUE)
nrow(patho_df) #4839
## [1] 4839
patho_df1 <- patho_df %>% filter(Phenotype_Class == "ASD/DD")
patho_df2 <- patho_df %>% filter(Phenotype_Class == "ASD/DD & PHTS")
patho_df3 <- patho_df %>% filter(Phenotype_Class == "PHTS")

patho_df1$var_class2 <- "ASD/DD"
patho_df2$var_class2 <- "ASD/DD & PHTS"
patho_df3$var_class2 <- "PHTS"
pten_gnomad$var_class2 <- "gnomAD"
pten_somatic$var_class2 <- "somatic"
colnames(patho_df1)
##  [1] "mutant"                          "DMS_score_abundance"            
##  [3] "DMS_score_bin_abundance"         "ESM.1v"                         
##  [5] "DMS_score_activity"              "DMS_score_bin_activity"         
##  [7] "mutation_position"               "V1"                             
##  [9] "Model"                           "Dataset"                        
## [11] "ddG_pred"                        "position"                       
## [13] "wildtype"                        "mutation"                       
## [15] "pdb"                             "chain"                          
## [17] "new_position"                    "uniprot"                        
## [19] "exposure_SS"                     "exposure_rASA"                  
## [21] "spot_disorder"                   "func_esms_residue_class"        
## [23] "func_esms_variant_class"         "clinvar_clinical_significance"  
## [25] "sequence"                        "Phenotype_Class"                
## [27] "Phenotype_List"                  "Fitness_Score"                  
## [29] "Abundance_Score"                 "HGVS.Consequence"               
## [31] "ClinVar.Germline.Classification" "rsIDs"                          
## [33] "ClinVar.Variation.ID"            "gnomad"                         
## [35] "fitted"                          "residuals"                      
## [37] "somatic"                         "var_class2"
patho_df1 <- patho_df1 %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, var_class2, residuals)
patho_df2 <- patho_df2 %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, var_class2,residuals)
patho_df3 <- patho_df3 %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, var_class2,residuals)
pten_gnomad <- pten_gnomad %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, var_class2,residuals)
pten_somatic <- pten_somatic %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, var_class2,residuals)
combined_df <- rbind(patho_df1,patho_df2,patho_df3,pten_gnomad, pten_somatic)
median_df <- combined_df %>%
  group_by(var_class2) %>%
  summarise(
    median_residual = median(residuals),
    n = n()
  )
median_df
## # A tibble: 5 × 3
##   var_class2    median_residual     n
##   <chr>                   <dbl> <int>
## 1 ASD/DD                0.162      12
## 2 ASD/DD & PHTS        -1.48       10
## 3 PHTS                 -1.77       36
## 4 gnomAD               -0.00905    37
## 5 somatic              -1.52       37
#wilcox.test(patho_df1$residuals, pten_gnomad$residuals)
combined_df$var_class2 <- factor(
  combined_df$var_class2,
  levels = c("gnomAD", "somatic","ASD/DD", "ASD/DD & PHTS", "PHTS")
)

custom_colors <- c(
  "gnomAD" = "#1b9e77",   # Teal green
  "somatic"   = "#fbb4ae",
  "ASD/DD"   = "#f4a6b3",   # Warm orange
  "ASD/DD & PHTS"   = "#e7298a",   # Muted purple
  "PHTS"   = "#7570b3"    # Hot pink / magenta
)

highlight_muts <- c("C124S", "G129E", "R130G", "R130Q", "P38S", "R130P", "D92H")

p10 <- ggplot(combined_df, aes(x = var_class2, y = residuals, fill = var_class2)) +
  geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
  geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
  stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
  stat_summary(fun = median, geom = "point", shape = 23, size = 2, fill = "black", color = "black", stroke = 0.7) +
  geom_text(
  data = median_df,
  aes(x = var_class2, y = median_residual + 0.5, label = sprintf("%.2f", median_residual)),
  inherit.aes = FALSE,
  size = 5
) +
  scale_fill_manual(values = custom_colors) +
  geom_text(
    data = median_df,
    aes(x = var_class2, y = 7.5, label = paste0("n = ", n)),
    inherit.aes = FALSE,
    size = 4.5
  ) +
  labs(
    title = "Across groups",
    x = "",
    y = "Activity-abundance residual",
    fill = ""
  ) +
  theme_classic(base_size = 14) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 20, hjust = 1) 
  ) +
    geom_signif(
    comparisons = list(c("gnomAD", "somatic"), 
                       c("gnomAD", "ASD/DD"), 
                       c("gnomAD", "ASD/DD & PHTS"),
                       c("gnomAD", "PHTS")),
    map_signif_level = FALSE,
    test = "wilcox.test",
    step_increase = 0.1,
    textsize = 3
  ) 

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig4_violin2.pdf", 
       plot = p10, width = 3, height = 4, dpi = 300)
p10

wilcox.test(patho_df1$residuals, pten_gnomad$residuals) #0.688
## 
##  Wilcoxon rank sum exact test
## 
## data:  patho_df1$residuals and pten_gnomad$residuals
## W = 235, p-value = 0.7743
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(patho_df2$residuals, pten_gnomad$residuals) #0.03351
## 
##  Wilcoxon rank sum exact test
## 
## data:  patho_df2$residuals and pten_gnomad$residuals
## W = 82, p-value = 0.006283
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(patho_df3$residuals, pten_gnomad$residuals) #9.383e-05
## 
##  Wilcoxon rank sum exact test
## 
## data:  patho_df3$residuals and pten_gnomad$residuals
## W = 299, p-value = 3.013e-05
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(patho_df1$residuals, patho_df2$residuals) #0.1693
## 
##  Wilcoxon rank sum exact test
## 
## data:  patho_df1$residuals and patho_df2$residuals
## W = 93, p-value = 0.02996
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(patho_df1$residuals, patho_df3$residuals) #0.02058
## 
##  Wilcoxon rank sum exact test
## 
## data:  patho_df1$residuals and patho_df3$residuals
## W = 336, p-value = 0.003475
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(patho_df2$residuals, patho_df3$residuals) #0.3959
## 
##  Wilcoxon rank sum exact test
## 
## data:  patho_df2$residuals and patho_df3$residuals
## W = 202, p-value = 0.5725
## alternative hypothesis: true location shift is not equal to 0
nrow(pten_gnomad) #37
## [1] 37
nrow(pten_patho) #115
## [1] 115
nrow(pten_somatic) #37
## [1] 37
pten_gnomad$var_source <- "gnomAD"
pten_patho$var_source <- "Pathogenic"
pten_somatic$var_source <- "Somatic"

pten_gnomad <- pten_gnomad %>% dplyr::select(mutant, residuals, var_source,DMS_score_abundance)
pten_patho <- pten_patho %>% dplyr::select(mutant, residuals, var_source,DMS_score_abundance)
pten_somatic <- pten_somatic  %>% dplyr::select(mutant, residuals, var_source,DMS_score_abundance)

combined_df <- rbind(pten_gnomad, pten_patho, pten_somatic)
nrow(combined_df)
## [1] 189
table(combined_df$var_source)
## 
##     gnomAD Pathogenic    Somatic 
##         37        115         37
pten_df_clean_2plot <- combined_df
median_df <- pten_df_clean_2plot %>%
  group_by(var_source) %>%
  summarise(
    median_residual = median(residuals, na.rm = TRUE),
    n = n()
  )
median_df
## # A tibble: 3 × 3
##   var_source median_residual     n
##   <chr>                <dbl> <int>
## 1 Pathogenic        -1.48      115
## 2 Somatic           -1.52       37
## 3 gnomAD            -0.00905    37
pten_df_clean_2plot$var_source <- factor(
  pten_df_clean_2plot$var_source,
  levels = c("gnomAD", "Somatic", "Pathogenic")
)
table(pten_df_clean_2plot$var_source)
## 
##     gnomAD    Somatic Pathogenic 
##         37         37        115
custom_colors <- c(
  "gnomAD"     = "#1b9e77",  # Teal green (unchanged)
  "Somatic"    = "#fbb4ae",  # Soft sky blue
  "Pathogenic" = "#de425b"   # Deep ocean blue
)


p11 <- ggplot(pten_df_clean_2plot, aes(x = var_source, y = residuals, fill = var_source)) +
  geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
  geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
  stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
  stat_summary(fun = median, geom = "point", shape = 23, size = 2, fill = "black", color = "black", stroke = 0.7) +
  geom_text(
    data = median_df,
    aes(x = var_source, y = median_residual + 1, label = sprintf("%.2f", median_residual)),
    inherit.aes = FALSE,
    size = 5
  ) +
  scale_fill_manual(values = custom_colors) +
  geom_text(
    data = median_df,
    aes(x = var_source, y = 7.5, label = paste0("n = ", n)),
    inherit.aes = FALSE,
    size = 4.5
  ) +
  labs(
    title = "All 173 Variants",
    x = "",
    y = "Activity-abundance residuals",
    fill = ""
  ) + theme_classic()+
  theme(
    legend.position = "none"
  ) +
  geom_signif(
    comparisons = list(c("gnomAD", "Somatic"), 
                       c("gnomAD", "Pathogenic"), 
                       c("Somatic", "Pathogenic")),
    map_signif_level = FALSE,
    test = "wilcox.test",
    step_increase = 0.1,
    textsize = 3
  ) 
p11

pten_df_clean_2plot <- pten_df_clean_2plot %>%
  mutate(mutation_position = as.numeric(str_extract(mutant, "(?<=\\D)(\\d+)(?=\\D)")))

pten_df_clean_2plot_int <- pten_df_clean_2plot %>% filter(mutation_position %in% all_positions)
nrow(pten_df_clean_2plot_int) #89
## [1] 89
length(unique(pten_df_clean_2plot_int$mutant)) #89
## [1] 89
median_df <- pten_df_clean_2plot_int %>%
  group_by(var_source) %>%
  summarise(
    median_residual = median(residuals, na.rm = TRUE),
    n = n()
  )
median_df
## # A tibble: 3 × 3
##   var_source median_residual     n
##   <fct>                <dbl> <int>
## 1 gnomAD             -0.0190    14
## 2 Somatic            -2.38      15
## 3 Pathogenic         -1.77      60
table(pten_df_clean_2plot_int$var_source)
## 
##     gnomAD    Somatic Pathogenic 
##         14         15         60
highlight_muts <- c("C124S", "G129E", "R130G", "R130Q", "P38S", "R130P", "D92H")
nrow(pten_df_clean_2plot_int)
## [1] 89
p12 <- ggplot(pten_df_clean_2plot_int, aes(x = var_source, y = residuals, fill = var_source)) +
  geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
  geom_jitter(width = 0.15, size = 2, alpha = 0.9, color = "lightgrey") +
  stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
  stat_summary(fun = median, geom = "point", shape = 23, size = 1, fill = "black", color = "black", stroke = 0.7) +
    geom_point(
    data = subset(pten_df_clean_2plot_int, mutant %in% highlight_muts),
    aes(x = var_source, y = residuals),
    shape = 17,
    size = 2,
    fill = "black",
    color = "darkcyan",
    stroke = 1
  ) + 
  geom_text_repel(data = subset(pten_df_clean_2plot_int, mutant %in% highlight_muts),
    aes(label=mutant),color = "darkcyan", size = 3)+
  geom_text(
    data = median_df,
    aes(x = var_source, y = median_residual + 1, label = sprintf("%.2f", median_residual)),
    inherit.aes = FALSE,
    size = 4
  ) +
  scale_fill_manual(values = custom_colors) +
  geom_text(
    data = median_df,
    aes(x = var_source, y = 7.5, label = paste0("n = ", n)),
    inherit.aes = FALSE,
    size = 4.5
  ) +
  labs(
    title = "89 Orthosteric Variants",
    x = "",
    y = "Activity-abundance residuals",
    fill = ""
  ) +
  theme_classic() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  geom_signif(
    comparisons = list(c("gnomAD", "Somatic"), 
                       c("gnomAD", "Pathogenic"), 
                       c("Somatic", "Pathogenic")),
    map_signif_level = FALSE,
    test = "wilcox.test",
    step_increase = 0.1,
    textsize = 3
  ) + ylim(-8, 11)

p12

pten_df_clean_2plot_out <- pten_df_clean_2plot %>% filter(!mutation_position %in% all_positions)
nrow(pten_df_clean_2plot_out) #100
## [1] 100
median_df <- pten_df_clean_2plot_out %>%
  group_by(var_source) %>%
  summarise(
    median_residual = median(residuals, na.rm = TRUE),
    n = n()
  )
median_df
## # A tibble: 3 × 3
##   var_source median_residual     n
##   <fct>                <dbl> <int>
## 1 gnomAD              0.0154    23
## 2 Somatic            -1.33      22
## 3 Pathogenic         -1.01      55
p13 <- ggplot(pten_df_clean_2plot_out, aes(x = var_source, y = residuals, fill = var_source)) +
  geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
  geom_jitter(width = 0.15, size = 2, alpha = 0.9, color = "lightgrey") +
  stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
  stat_summary(fun = median, geom = "point", shape = 23, size = 1, fill = "black", color = "black", stroke = 0.7) +
    geom_point(
    data = subset(pten_df_clean_2plot_out, mutant %in% highlight_muts),
    aes(x = var_source, y = residuals),
    shape = 17,
    size = 2,
    fill = "black",
    color = "darkcyan",
    stroke = 1
  ) + 
  geom_text_repel(data = subset(pten_df_clean_2plot_out, mutant %in% highlight_muts),
    aes(label=mutant),color = "darkcyan",size = 3)+
  geom_text(
    data = median_df,
    aes(x = var_source, y = median_residual + 1, label = sprintf("%.2f", median_residual)),
    inherit.aes = FALSE,
    size = 4
  ) +
  scale_fill_manual(values = custom_colors) +
  geom_text(
    data = median_df,
    aes(x = var_source, y = 7.5, label = paste0("n = ", n)),
    inherit.aes = FALSE,
    size = 4.5
  ) +
  labs(
    title = "100 Non-orthosteric Variants",
    x = "",
    y = "Activity-abundance residuals",
    fill = ""
  ) +
  theme_classic() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  geom_signif(
    comparisons = list(c("gnomAD", "Somatic"), 
                       c("gnomAD", "Pathogenic"), 
                       c("Somatic", "Pathogenic")),
    map_signif_level = FALSE,
    test = "wilcox.test",
    step_increase = 0.1,
    textsize = 3
  ) + ylim(-8, 11)

p13

pten_df_clean_2plot_stable <- pten_df_clean_2plot %>% filter(DMS_score_abundance > 0.71)
nrow(pten_df_clean_2plot_stable) #81
## [1] 81
table(pten_df_clean_2plot_stable$var_source)
## 
##     gnomAD    Somatic Pathogenic 
##         28         14         39
pten_df_clean_2plot_stable_int <- pten_df_clean_2plot_stable %>% filter(mutation_position %in% all_positions)
nrow(pten_df_clean_2plot_stable_int) #48
## [1] 48
median_df <- pten_df_clean_2plot_stable_int %>%
  group_by(var_source) %>%
  summarise(
    median_residual = median(residuals, na.rm = TRUE),
    n = n()
  )
median_df
## # A tibble: 3 × 3
##   var_source median_residual     n
##   <fct>                <dbl> <int>
## 1 gnomAD              -0.222     9
## 2 Somatic             -2.12      8
## 3 Pathogenic          -2.41     31
p14 <- ggplot(pten_df_clean_2plot_stable_int, aes(x = var_source, y = residuals, fill = var_source)) +
  geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
  geom_jitter(width = 0.15, size = 2, alpha = 0.9, color = "lightgrey") +
  stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
  stat_summary(fun = median, geom = "point", shape = 23, size = 1, fill = "black", color = "black", stroke = 0.7) +
  geom_text(
    data = median_df,
    aes(x = var_source, y = median_residual + 1, label = sprintf("%.2f", median_residual)),
    inherit.aes = FALSE,
    size = 5
  ) +
  scale_fill_manual(values = custom_colors) +
  geom_text(
    data = median_df,
    aes(x = var_source, y = 7.5, label = paste0("n = ", n)),
    inherit.aes = FALSE,
    size = 4
  ) +
  labs(
    title = "48 WT-abundance Orthosteric Variants",
    x = "",
    y = "LOESS fit residuals",
    fill = ""
  ) +
  theme_classic() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  geom_signif(
    comparisons = list(c("gnomAD", "Somatic"), 
                       c("gnomAD", "Pathogenic"), 
                       c("Somatic", "Pathogenic")),
    map_signif_level = FALSE,
    test = "wilcox.test",
    step_increase = 0.1,
    textsize = 3
  ) + ylim(-8, 11)

p14
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_violin()`).

pten_df_clean_2plot_stable_out <- pten_df_clean_2plot_stable %>% filter(!mutation_position %in% all_positions)
nrow(pten_df_clean_2plot_stable_out) #33
## [1] 33
median_df <- pten_df_clean_2plot_stable_out %>%
  group_by(var_source) %>%
  summarise(
    median_residual = median(residuals, na.rm = TRUE),
    n = n()
  )
median_df
## # A tibble: 3 × 3
##   var_source median_residual     n
##   <fct>                <dbl> <int>
## 1 gnomAD             -0.0127    19
## 2 Somatic            -1.34       6
## 3 Pathogenic         -1.56       8
p15 <- ggplot(pten_df_clean_2plot_stable_out, aes(x = var_source, y = residuals, fill = var_source)) +
  geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
  geom_jitter(width = 0.15, size = 2, alpha = 0.9, color = "lightgrey") +
  stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
  stat_summary(fun = median, geom = "point", shape = 23, size = 1, fill = "black", color = "black", stroke = 0.7) +
  geom_text(
    data = median_df,
    aes(x = var_source, y = median_residual + 1, label = sprintf("%.2f", median_residual)),
    inherit.aes = FALSE,
    size = 4
  ) +
  scale_fill_manual(values = custom_colors) +
  geom_text(
    data = median_df,
    aes(x = var_source, y = 7.5, label = paste0("n = ", n)),
    inherit.aes = FALSE,
    size = 4.5
  ) +
  labs(
    title = "33 WT-abundance Non-orthosteric Variants",
    x = "",
    y = "LOESS fit residuals",
    fill = ""
  ) +
  theme_classic() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  geom_signif(
    comparisons = list(c("gnomAD", "Somatic"), 
                       c("gnomAD", "Pathogenic"), 
                       c("Somatic", "Pathogenic")),
    map_signif_level = FALSE,
    test = "wilcox.test",
    step_increase = 0.1,
    textsize = 3
  ) + ylim(-8, 11)

p15 

range(pten_df_clean_2plot$residuals)
## [1] -5.157748  2.422716
wilcox.test(pten_df_clean_2plot_stable_int %>% filter(var_source == "gnomAD")  %>% pull(residuals),
            pten_df_clean_2plot_stable_int %>% filter(var_source == "Pathogenic")  %>% pull(residuals))
## 
##  Wilcoxon rank sum exact test
## 
## data:  pten_df_clean_2plot_stable_int %>% filter(var_source == "gnomAD") %>% pull(residuals) and pten_df_clean_2plot_stable_int %>% filter(var_source == "Pathogenic") %>% pull(residuals)
## W = 224, p-value = 0.004996
## alternative hypothesis: true location shift is not equal to 0
p16 <- plot_grid(p12,p13,ncol=2,nrow=1)
p16

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig4_violin3.pdf", 
       plot = p16, width = 5, height = 3, dpi = 300)
p17 <- plot_grid(p14,p15,ncol=2,nrow=1)
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_violin()`).
p17

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig4_violin4.pdf", 
       plot = p17, width = 6, height = 4, dpi = 300)
pdb <- read.pdb("~/Documents/0.Projects/01.protein-seq-evo-v1/data/residual_pdb/PTEN/1d5r.pdb")

# Extract C-alpha atoms from protein (exclude ligand)
protein_ca <- pdb$atom[pdb$atom$elety == "CA" & pdb$atom$resid != "TLA", ]

# Extract ligand atoms (TLA residue)
ligand_atoms <- pdb$atom[pdb$atom$resid == "TLA" & pdb$atom$type == "HETATM", ]

# Calculate min distance from each C-alpha to the ligand
protein_ca$min_dist_to_ligand <- apply(protein_ca, 1, function(atom) {
  dists <- sqrt((as.numeric(atom[["x"]]) - as.numeric(ligand_atoms$x))^2 +
                  (as.numeric(atom[["y"]]) - as.numeric(ligand_atoms$y))^2 +
                  (as.numeric(atom[["z"]]) - as.numeric(ligand_atoms$z))^2)
  return(min(dists))
})

# View summary
nrow(protein_ca) #307
## [1] 307
summary(protein_ca$min_dist_to_ligand)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.353  14.371  19.765  19.785  25.267  37.348
head(protein_ca$min_dist_to_ligand)
## [1] 18.59907 17.19423 14.38261 15.13317 17.71047 20.54182
length(protein_ca$min_dist_to_ligand) #307
## [1] 307
fil_protein_ca <- protein_ca %>% dplyr::select(resid, resno, min_dist_to_ligand)
merged_df <- merge(pten_df_clean, fil_protein_ca, by.x="mutation_position", by.y = "resno", all.x = TRUE)
nrow(merged_df)
## [1] 4839
head(merged_df)
##   mutation_position mutant DMS_score_abundance DMS_score_bin_abundance
## 1                 2    T2A           0.7755378                       1
## 2                 2    T2D           1.1906667                       1
## 3                 2    T2K           0.9446276                       1
## 4                 2    T2N           1.1539191                       1
## 5                 2    T2P           0.8433099                       1
## 6                 2    T2R           0.9024487                       1
##       ESM.1v DMS_score_activity DMS_score_bin_activity V1      Model
## 1 -0.0355402        2.275491838                      1 20 ThermoMPNN
## 2 -9.5293701        2.387178001                      1 22 ThermoMPNN
## 3 -7.9349979       -0.009668751                      1 28 ThermoMPNN
## 4 -7.4877127       -3.204991850                      0 31 ThermoMPNN
## 5 -6.1493063       -0.257056982                      1 32 ThermoMPNN
## 6 -9.1065905        0.551584254                      1 34 ThermoMPNN
##                 Dataset    ddG_pred position wildtype mutation
## 1 AF-P60484-F1-model_v2  0.06013024        1        T        A
## 2 AF-P60484-F1-model_v2 -0.01170939        1        T        D
## 3 AF-P60484-F1-model_v2  0.19344133        1        T        K
## 4 AF-P60484-F1-model_v2  0.06487387        1        T        N
## 5 AF-P60484-F1-model_v2  0.16262442        1        T        P
## 6 AF-P60484-F1-model_v2  0.20207280        1        T        R
##                     pdb chain new_position uniprot exposure_SS exposure_rASA
## 1 AF-P60484-F1-model_v2     A            2  P60484           H          0.57
## 2 AF-P60484-F1-model_v2     A            2  P60484           H          0.57
## 3 AF-P60484-F1-model_v2     A            2  P60484           H          0.57
## 4 AF-P60484-F1-model_v2     A            2  P60484           H          0.57
## 5 AF-P60484-F1-model_v2     A            2  P60484           H          0.57
## 6 AF-P60484-F1-model_v2     A            2  P60484           H          0.57
##   spot_disorder func_esms_residue_class func_esms_variant_class
## 1             1                       1                       0
## 2             1                       1                       1
## 3             1                       1                       0
## 4             1                       1                       0
## 5             1                       1                       0
## 6             1                       1                       1
##   clinvar_clinical_significance
## 1                              
## 2                              
## 3                              
## 4                              
## 5                              
## 6                              
##                                                                                                                                                                                                                                                                                                                                                                                                              sequence
## 1 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 2 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 3 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 4 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 5 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 6 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
##   Phenotype_Class Phenotype_List Fitness_Score Abundance_Score HGVS.Consequence
## 1            <NA>           <NA>            NA              NA             <NA>
## 2            <NA>           <NA>            NA              NA             <NA>
## 3            <NA>           <NA>            NA              NA             <NA>
## 4            <NA>           <NA>            NA              NA             <NA>
## 5            <NA>           <NA>            NA              NA             <NA>
## 6            <NA>           <NA>            NA              NA             <NA>
##   ClinVar.Germline.Classification rsIDs ClinVar.Variation.ID gnomad     fitted
## 1                            <NA>  <NA>                   NA   <NA> -0.1669595
## 2                            <NA>  <NA>                   NA   <NA> -0.1979372
## 3                            <NA>  <NA>                   NA   <NA> -0.1787797
## 4                            <NA>  <NA>                   NA   <NA> -0.1899017
## 5                            <NA>  <NA>                   NA   <NA> -0.1742966
## 6                            <NA>  <NA>                   NA   <NA> -0.1759730
##     residuals somatic resid min_dist_to_ligand
## 1  2.44245136    <NA>  <NA>                 NA
## 2  2.58511520    <NA>  <NA>                 NA
## 3  0.16911091    <NA>  <NA>                 NA
## 4 -3.01509015    <NA>  <NA>                 NA
## 5 -0.08276039    <NA>  <NA>                 NA
## 6  0.72755725    <NA>  <NA>                 NA
#merged_df <- merged_df %>% dplyr::select(-sequence)
#nrow(merged_df)

merged_df <- merged_df %>% filter(residuals <= 0)
nrow(merged_df) #2449
## [1] 2449
all_positions <- c(
  88:98,    # WPD loop
  160:171,  # TI loop
  123:130,  # P loop
  35:49,    # Arginine loop
  200:212,  # CBR1
  226:238,  # CBR2
  258:268,  # CBR3
  327:335,   # Cα2 loop
  151:174, # membrane-binding alpha-helix
  1:15 # PBM motif 
)

active_positions <- c(
  88:98,    # WPD loop
  159:171,  # TI loop
  123:130,  # P loop
  35:49   # Arginine loop
)

binding_positions <- c(
  200:212,  # CBR1
  226:238,  # CBR2
  258:268,  # CBR3
  327:335,   # Cα2 loop
  151:174, # membrane-binding alpha-helix
  1:15 # PBM motif 
)
merged_df <- merge(pten_df_clean, fil_protein_ca, by.x="mutation_position", by.y = "resno", all.x = TRUE)
#merged_df <- merged_df %>% dplyr::select(-sequence)
nrow(merged_df) #4839
## [1] 4839
head(merged_df)
##   mutation_position mutant DMS_score_abundance DMS_score_bin_abundance
## 1                 2    T2A           0.7755378                       1
## 2                 2    T2D           1.1906667                       1
## 3                 2    T2K           0.9446276                       1
## 4                 2    T2N           1.1539191                       1
## 5                 2    T2P           0.8433099                       1
## 6                 2    T2R           0.9024487                       1
##       ESM.1v DMS_score_activity DMS_score_bin_activity V1      Model
## 1 -0.0355402        2.275491838                      1 20 ThermoMPNN
## 2 -9.5293701        2.387178001                      1 22 ThermoMPNN
## 3 -7.9349979       -0.009668751                      1 28 ThermoMPNN
## 4 -7.4877127       -3.204991850                      0 31 ThermoMPNN
## 5 -6.1493063       -0.257056982                      1 32 ThermoMPNN
## 6 -9.1065905        0.551584254                      1 34 ThermoMPNN
##                 Dataset    ddG_pred position wildtype mutation
## 1 AF-P60484-F1-model_v2  0.06013024        1        T        A
## 2 AF-P60484-F1-model_v2 -0.01170939        1        T        D
## 3 AF-P60484-F1-model_v2  0.19344133        1        T        K
## 4 AF-P60484-F1-model_v2  0.06487387        1        T        N
## 5 AF-P60484-F1-model_v2  0.16262442        1        T        P
## 6 AF-P60484-F1-model_v2  0.20207280        1        T        R
##                     pdb chain new_position uniprot exposure_SS exposure_rASA
## 1 AF-P60484-F1-model_v2     A            2  P60484           H          0.57
## 2 AF-P60484-F1-model_v2     A            2  P60484           H          0.57
## 3 AF-P60484-F1-model_v2     A            2  P60484           H          0.57
## 4 AF-P60484-F1-model_v2     A            2  P60484           H          0.57
## 5 AF-P60484-F1-model_v2     A            2  P60484           H          0.57
## 6 AF-P60484-F1-model_v2     A            2  P60484           H          0.57
##   spot_disorder func_esms_residue_class func_esms_variant_class
## 1             1                       1                       0
## 2             1                       1                       1
## 3             1                       1                       0
## 4             1                       1                       0
## 5             1                       1                       0
## 6             1                       1                       1
##   clinvar_clinical_significance
## 1                              
## 2                              
## 3                              
## 4                              
## 5                              
## 6                              
##                                                                                                                                                                                                                                                                                                                                                                                                              sequence
## 1 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 2 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 3 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 4 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 5 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
## 6 MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
##   Phenotype_Class Phenotype_List Fitness_Score Abundance_Score HGVS.Consequence
## 1            <NA>           <NA>            NA              NA             <NA>
## 2            <NA>           <NA>            NA              NA             <NA>
## 3            <NA>           <NA>            NA              NA             <NA>
## 4            <NA>           <NA>            NA              NA             <NA>
## 5            <NA>           <NA>            NA              NA             <NA>
## 6            <NA>           <NA>            NA              NA             <NA>
##   ClinVar.Germline.Classification rsIDs ClinVar.Variation.ID gnomad     fitted
## 1                            <NA>  <NA>                   NA   <NA> -0.1669595
## 2                            <NA>  <NA>                   NA   <NA> -0.1979372
## 3                            <NA>  <NA>                   NA   <NA> -0.1787797
## 4                            <NA>  <NA>                   NA   <NA> -0.1899017
## 5                            <NA>  <NA>                   NA   <NA> -0.1742966
## 6                            <NA>  <NA>                   NA   <NA> -0.1759730
##     residuals somatic resid min_dist_to_ligand
## 1  2.44245136    <NA>  <NA>                 NA
## 2  2.58511520    <NA>  <NA>                 NA
## 3  0.16911091    <NA>  <NA>                 NA
## 4 -3.01509015    <NA>  <NA>                 NA
## 5 -0.08276039    <NA>  <NA>                 NA
## 6  0.72755725    <NA>  <NA>                 NA
sum(!is.na(merged_df$min_dist_to_ligand)) #3738
## [1] 3738
merged_df <- merged_df %>% filter(!is.na(min_dist_to_ligand))

patho_df1 <- merged_df %>% filter (mutant %in% patho_df1$mutant)
nrow(patho_df1) #11
## [1] 11
patho_df2 <- merged_df %>% filter (mutant %in% patho_df2$mutant)
nrow(patho_df2) #11
## [1] 10
patho_df3 <- merged_df %>% filter (mutant %in% patho_df3$mutant)
nrow(patho_df3) #34
## [1] 33
pten_gnomad <- merged_df %>% filter (mutant %in% pten_gnomad$mutant)
nrow(pten_gnomad) #28
## [1] 28
pten_somatic <- merged_df %>% filter (mutant %in% pten_somatic$mutant)
nrow(pten_somatic) #37
## [1] 37
patho_df1$var_class2 <- "ASD/DD"
patho_df2$var_class2 <- "ASD/DD & PHTS"
patho_df3$var_class2 <- "PHTS"
pten_gnomad$var_class2 <- "gnomAD"
pten_somatic$var_class2 <- "somatic"
colnames(pten_gnomad)
##  [1] "mutation_position"               "mutant"                         
##  [3] "DMS_score_abundance"             "DMS_score_bin_abundance"        
##  [5] "ESM.1v"                          "DMS_score_activity"             
##  [7] "DMS_score_bin_activity"          "V1"                             
##  [9] "Model"                           "Dataset"                        
## [11] "ddG_pred"                        "position"                       
## [13] "wildtype"                        "mutation"                       
## [15] "pdb"                             "chain"                          
## [17] "new_position"                    "uniprot"                        
## [19] "exposure_SS"                     "exposure_rASA"                  
## [21] "spot_disorder"                   "func_esms_residue_class"        
## [23] "func_esms_variant_class"         "clinvar_clinical_significance"  
## [25] "sequence"                        "Phenotype_Class"                
## [27] "Phenotype_List"                  "Fitness_Score"                  
## [29] "Abundance_Score"                 "HGVS.Consequence"               
## [31] "ClinVar.Germline.Classification" "rsIDs"                          
## [33] "ClinVar.Variation.ID"            "gnomad"                         
## [35] "fitted"                          "residuals"                      
## [37] "somatic"                         "resid"                          
## [39] "min_dist_to_ligand"              "var_class2"
patho_df1 <- patho_df1 %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, var_class2, residuals,min_dist_to_ligand)
patho_df2 <- patho_df2 %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, var_class2,residuals,min_dist_to_ligand)
patho_df3 <- patho_df3 %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, var_class2,residuals,min_dist_to_ligand)
pten_gnomad <- pten_gnomad %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, var_class2,residuals,min_dist_to_ligand)
pten_somatic <- pten_somatic %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, var_class2,residuals,min_dist_to_ligand)

combined_df <- rbind(patho_df1,patho_df2,patho_df3,pten_gnomad, pten_somatic)
nrow(combined_df)
## [1] 119
median_df <- combined_df %>%
  group_by(var_class2) %>%
  summarise(
    median_dist = median(min_dist_to_ligand, na.rm = TRUE),
    n = n()
  )
median_df
## # A tibble: 5 × 3
##   var_class2    median_dist     n
##   <chr>               <dbl> <int>
## 1 ASD/DD               17.6    11
## 2 ASD/DD & PHTS        13.6    10
## 3 PHTS                 14.7    33
## 4 gnomAD               20.4    28
## 5 somatic              15.0    37
head(combined_df)
##   mutant DMS_score_abundance DMS_score_activity var_class2  residuals
## 1   R14G         1.041911071         -0.7107802     ASD/DD -0.5318023
## 2   Q17E         0.971256096         -0.5467923     ASD/DD -0.3682351
## 3   H61Y         0.941901563         -0.5830217     ASD/DD -0.4042427
## 4   Y68C        -0.008473303         -2.7323465     ASD/DD  0.4783478
## 5   P95T         0.155054169         -0.4500599     ASD/DD  1.9428391
## 6  L139F        -0.126631454         -0.4326210     ASD/DD  3.3608216
##   min_dist_to_ligand
## 1           18.59907
## 2           15.13317
## 3           24.22288
## 4           12.59897
## 5            7.55193
## 6           17.61638
combined_df$var_class2 <- factor(
  combined_df$var_class2,
  levels = c("gnomAD", "somatic", "ASD/DD", "ASD/DD & PHTS", "PHTS")
)

custom_colors <- c(
  "gnomAD"     = "#1b9e77",  # Teal green (unchanged)
  "somatic"    = "#fbb4ae",  # Soft sky blue
  "ASD/DD"   = "#f4a6b3",   # Warm orange
  "ASD/DD & PHTS"   = "#e7298a",   # Muted purple
  "PHTS"   = "#7570b3"    # Hot pink / magenta
)

active_positions <- c(
  88:98,    # WPD loop
  159:171,  # TI loop
  35:49,    # Arginine loop
  123:130  # P loop
)

binding_positions <- c(
  200:212,  # CBR1
  226:238,  # CBR2
  258:268,  # CBR3
  327:335,   # Cα2 loop
  151:174, # membrane-binding alpha-helix
  1:15 # PBM motif 
)

combined_df <- combined_df %>%
  mutate(mutation_position = as.numeric(str_extract(mutant, "(?<=\\D)(\\d+)(?=\\D)")))

combined_df$site_type <- "Non-orthosteric site"
combined_df$site_type[combined_df$mutation_position %in% active_positions] <- "Active site"
combined_df$site_type[combined_df$mutation_position %in% binding_positions] <- "Binding site"
table(combined_df$site_type)
## 
##          Active site         Binding site Non-orthosteric site 
##                   26                   22                   71
p1 <- ggplot(combined_df, aes(x = var_class2, y = min_dist_to_ligand, fill = var_class2)) +
  geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
 geom_jitter(aes(shape = site_type), width = 0.15, size = 2.5, alpha = 0.8,
            fill = "lightgrey", color = "darkgrey", stroke = 0.5)+
  stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
  stat_summary(fun = median, geom = "point", shape = 23, size = 2, fill = "black", color = "black", stroke = 0.7) +
  geom_text(
    data = median_df,
    aes(x = var_class2, y =  median_dist + 1, label = sprintf("%.2f",  median_dist)),
    inherit.aes = FALSE,
    size = 5
  ) +
  scale_fill_manual(values = custom_colors, guide = "none") +
  scale_shape_manual(values = c(
  "Non-orthosteric site" = 21,
  "Active site" = 22,
  "Binding site" = 23
)) +
  geom_text(
    data = median_df,
    aes(x = var_class2, y = 40, label = paste0("n = ", n)),
    inherit.aes = FALSE,
    size = 4.5
  ) +
  labs(
    title = "119 Variants",
    x = "",
    y = "Distance to TLA",
    fill = "",
    shape = "Site Type"
  ) +
  theme_classic() +
  theme(
    legend.position = "right",
    axis.text.x = element_text(angle = 45, hjust = 1)

  ) +
  geom_signif(
    comparisons = list(c("gnomAD", "somatic"), 
                       c("gnomAD", "ASD/DD"), 
                       c("gnomAD", "ASD/DD & PHTS"),
                       c("gnomAD", "PHTS")),
    map_signif_level = FALSE,
    test = "wilcox.test",
    step_increase = 0.1,
    textsize = 3
  ) 
p1
## Warning in wilcox.test.default(c(18.8445924869709, 18.8445924869709,
## 13.872890181934, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(18.8445924869709, 18.8445924869709,
## 13.872890181934, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(18.8445924869709, 18.8445924869709,
## 13.872890181934, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(18.8445924869709, 18.8445924869709,
## 13.872890181934, : cannot compute exact p-value with ties

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig4_violin_distance.pdf", 
       plot = p1, width = 6, height = 4, dpi = 300)
## Warning in wilcox.test.default(c(18.8445924869709, 18.8445924869709,
## 13.872890181934, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(18.8445924869709, 18.8445924869709,
## 13.872890181934, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(18.8445924869709, 18.8445924869709,
## 13.872890181934, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(18.8445924869709, 18.8445924869709,
## 13.872890181934, : cannot compute exact p-value with ties
p1
## Warning in wilcox.test.default(c(18.8445924869709, 18.8445924869709,
## 13.872890181934, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(18.8445924869709, 18.8445924869709,
## 13.872890181934, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(18.8445924869709, 18.8445924869709,
## 13.872890181934, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(18.8445924869709, 18.8445924869709,
## 13.872890181934, : cannot compute exact p-value with ties

merged_df <- merge(pten_df_clean, fil_protein_ca, by.x="mutation_position", by.y = "resno", all.x = TRUE)
#merged_df <- merged_df %>% dplyr::select(-sequence)

merged_df <- merged_df %>% filter(!is.na(min_dist_to_ligand))
merged_df_neg <- merged_df %>% filter(residuals <= 0)
nrow(merged_df)
## [1] 3738
merged_df_residue <- merged_df_neg %>%
  group_by(mutation_position) %>%
  summarise(
    loess_residual_avg = median(residuals, na.rm = TRUE))
    
merged_df_residue <- merge(merged_df_residue, protein_ca, by.x="mutation_position", by.y = "resno", all.x = TRUE)
    
merged_df_residue <- merged_df_residue[!is.na(merged_df_residue$min_dist_to_ligand), ]
nrow(merged_df_residue) #280
## [1] 280
# Fit exponential decay: y = a * exp(-b * x) + c
exp_model <- nls(abs(loess_residual_avg) ~ a * exp(-b * min_dist_to_ligand),
                 data = merged_df_residue,
                 start = list(a = 1, b = 0.1))
exp_model
## Nonlinear regression model
##   model: abs(loess_residual_avg) ~ a * exp(-b * min_dist_to_ligand)
##    data: merged_df_residue
##       a       b 
## 2.54954 0.04794 
##  residual sum-of-squares: 152.4
## 
## Number of iterations to convergence: 6 
## Achieved convergence tolerance: 3.505e-06
# Nonlinear regression model
#   model: abs(loess_residual_avg) ~ a * exp(-b * min_dist_to_ligand)
#    data: merged_df_residue
#       a       b 
# 2.54954 0.04794 
#  residual sum-of-squares: 152.4
# 
# Number of iterations to convergence: 6 
# Achieved convergence tolerance: 3.505e-06
a <- coef(exp_model)[["a"]]
b <- coef(exp_model)[["b"]]

merged_df_neg <- merged_df_neg %>%
  mutate(expected_res = a * exp(-b * min_dist_to_ligand),
         residual = abs(residuals) - expected_res)

# One-sided Wilcoxon signed-rank test for each residue
wilcox_test_results <- merged_df_neg %>%
  group_by(mutation_position) %>%
  summarise(
    p_wilcox = tryCatch(wilcox.test(residual, mu = 0, alternative = "greater")$p.value, error = function(e) NA_real_),
    .groups = "drop"
  ) %>%
  mutate(p_adj = p.adjust(p_wilcox, method = "fdr")) 

merged_df_residue <- left_join(merged_df_residue, wilcox_test_results, by = "mutation_position") %>%
  mutate(
    hotspot = !is.na(p_adj) & p_adj < 0.05,
    hotspot_label = ifelse(hotspot, as.character(mutation_position), NA)
  )

merged_df_residue %>% filter(hotspot == TRUE) %>% pull(mutation_position)
##  [1]  15  23  25  35  38  56  57  61  92 130 159 165 171
#115  23  25  35  38  56  57  61  92 130 159 165 171
half_d <- log(2) / b  
half_d #14.45864
## [1] 14.45815
#y_cutoff <- a  * exp(- b * half_d)
#y_cutoff

x_vals <- seq(min(merged_df_residue$min_dist_to_ligand),
              max(merged_df_residue$min_dist_to_ligand), length.out = 200)

# --- Bootstrapping for confidence intervals ---
set.seed(11)
boot_params <- replicate(1000, {
  samp <- merged_df_residue[sample(nrow(merged_df_residue), replace = TRUE), ]
  fit <- try(nlsLM(abs(loess_residual_avg) ~ a * exp(-b * min_dist_to_ligand),
                   data = samp, start = list(a = 1, b = 0.1)), silent = TRUE)
  if (inherits(fit, "try-error")) c(NA, NA) else coef(fit)
})
boot_params <- t(boot_params)[complete.cases(t(boot_params)), ]

boot_preds <- apply(boot_params, 1, function(p) p[1] * exp(-p[2] * x_vals))

fit_df_residue <- data.frame(
  min_dist_to_ligand = x_vals,
  loess_residual_avg = predict(exp_model, newdata = data.frame(min_dist_to_ligand = x_vals)),
  lower = apply(boot_preds, 1, quantile, probs = 0.025),
  upper = apply(boot_preds, 1, quantile, probs = 0.975)
)


nrow(merged_df_residue) #280
## [1] 280
merged_df_residue$site_type <- "Non-orthosteric site"
#merged_df_residue$site_type[abs(merged_df_residue$loess_residual_avg) <= y_cutoff] <- "Null"
merged_df_residue$site_type[merged_df_residue$mutation_position %in% binding_positions] <- "Binding site"
merged_df_residue$site_type[merged_df_residue$mutation_position %in% active_positions] <- "Active site"

table(merged_df_residue$site_type)
## 
##          Active site         Binding site Non-orthosteric site 
##                   45                   52                  183
max(merged_df_residue %>% filter (site_type == "Active site") %>% pull(min_dist_to_ligand)) #16.42677
## [1] 16.42677
# Plot with fitted curve
p23 <- ggplot(merged_df_residue, aes(x = min_dist_to_ligand, y = abs(loess_residual_avg))) +
  geom_point(aes(color = site_type), size = 1) +
  # First, plot NULL points separately with alpha = 0.2
  #geom_point(data = subset(merged_df_residue, site_type == "Null"),
  #           aes(color = site_type), size = 1, alpha = 0.7) +
  # Then plot the rest
  #geom_point(data = subset(merged_df_residue, site_type != "Null"),
  #           aes(color = site_type), size = 1) +
  #geom_text_repel(data = subset(merged_df_residue, mutation_position %in% c(123:130)) ,
  #                aes(label = mutation_position, color = site_type), vjust = -0.5, size = 3) +
  #geom_text_repel(data = subset(merged_df_residue, mutation_position %in% c(1:14)) ,
  #                aes(label = mutation_position, color = site_type), vjust = -0.5, size = 3) +
  geom_text_repel(data = merged_df_residue %>% filter(hotspot) %>% filter (site_type != "Binding site"), aes(label = hotspot_label,color = site_type), size = 3.5, max.overlaps = 50) +
  #geom_text_repel(data = merged_df_residue %>% filter (abs(loess_residual_avg) >= y_cutoff) %>% filter (min_dist_to_ligand > 16.42677),
  #                aes(label = mutation_position, color = site_type), vjust = 0.5, size = 3) +
  geom_ribbon(data = fit_df_residue,
            aes(x = min_dist_to_ligand, ymin = lower, ymax = upper),
            fill = "grey70", alpha = 0.3, inherit.aes = FALSE) +
  geom_line(data = fit_df_residue, aes(x = min_dist_to_ligand, y = abs(loess_residual_avg)),
          inherit.aes = FALSE, color = "black", size = 1) +
  scale_color_manual(values = c(
    "Null" = "grey",
    "Non-orthosteric site" = "darkgreen",
    "Binding site" = "cyan",
    "Active site" = "orange"
  )) +
  theme_classic() +
  geom_vline(xintercept = 16.42677, linetype = "dashed", color = "slategrey") +
  #geom_hline(yintercept = y_cutoff, linetype = "dashed", color = "slategrey") +
  labs(
    title = "PTEN: per-residue allosteric decay",
    subtitle = "280 residues",
    x = "Minimal distance to TLA",
    y = "|Median activity-abundance residual|",
    color = ""
  )  + theme(legend.position = "none")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
    # annotate("text",  x = Inf, y = Inf,
    #          hjust = 1, vjust = 1,
    #        label = "y = a * exp (b * x)\na = 2.54954 \nb = - 0.04794",
    #        size = 4, color = "black", hjust = 0) 


p23 <- ggMarginal(
  p23,
  type = "density",
  margins = "both",
  groupColour = FALSE,
  groupFill = FALSE,
  size = 10,
  colour = "grey",
  fill = "lightgrey"
)
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig4_decay1.pdf", 
       plot = p23, width = 5, height = 5, dpi = 300)
p23

lm_model <- lm(log(abs(loess_residual_avg)) ~ min_dist_to_ligand, data = merged_df_residue)
summary(lm_model)
## 
## Call:
## lm(formula = log(abs(loess_residual_avg)) ~ min_dist_to_ligand, 
##     data = merged_df_residue)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2854 -0.5895  0.0866  0.6599  1.5076 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         0.405279   0.134745   3.008  0.00287 ** 
## min_dist_to_ligand -0.033466   0.006372  -5.252 2.99e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8119 on 278 degrees of freedom
## Multiple R-squared:  0.09027,    Adjusted R-squared:  0.087 
## F-statistic: 27.59 on 1 and 278 DF,  p-value: 2.992e-07
# Call:
# lm(formula = log(abs(loess_residual_avg)) ~ min_dist_to_ligand, 
#     data = merged_df_residue)
# 
# Residuals:
#     Min      1Q  Median      3Q     Max 
# -3.2854 -0.5895  0.0866  0.6599  1.5076 
# 
# Coefficients:
#                     Estimate Std. Error t value Pr(>|t|)    
# (Intercept)         0.405279   0.134745   3.008  0.00287 ** 
# min_dist_to_ligand -0.033466   0.006372  -5.252 2.99e-07 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# Residual standard error: 0.8119 on 278 degrees of freedom
# Multiple R-squared:  0.09027, Adjusted R-squared:  0.087 
# F-statistic: 27.59 on 1 and 278 DF,  p-value: 2.992e-07
merged_df <- merge(pten_df_clean, fil_protein_ca, by.x="mutation_position", by.y = "resno", all.x = TRUE)
#merged_df <- merged_df %>% dplyr::select(-sequence)

merged_df <- merged_df %>% filter(!is.na(min_dist_to_ligand))
merged_df_neg <- merged_df %>% filter(residuals <= 0)
nrow(merged_df_neg) #2003
## [1] 2003
# Fit exponential decay: y = a * exp(-b * x) + c
exp_model <- nls(abs(residuals) ~ a * exp(-b * min_dist_to_ligand),
                 data = merged_df_neg,
                 start = list(a = 1, b = 0.1))
exp_model
## Nonlinear regression model
##   model: abs(residuals) ~ a * exp(-b * min_dist_to_ligand)
##    data: merged_df_neg
##       a       b 
## 2.99859 0.04776 
##  residual sum-of-squares: 2231
## 
## Number of iterations to convergence: 6 
## Achieved convergence tolerance: 9.093e-07
# Nonlinear regression model
#   model: abs(residuals) ~ a * exp(-b * min_dist_to_ligand)
#    data: merged_df
#       a       b 
# 2.99859 0.04776 
#  residual sum-of-squares: 2231
# 
# Number of iterations to convergence: 6 
# Achieved convergence tolerance: 9.093e-07
a <- coef(exp_model)[["a"]]
b <- coef(exp_model)[["b"]]

merged_df_neg <- merged_df_neg %>%
  mutate(
    expected_res = a * exp(-b * min_dist_to_ligand),
    observed_abs_res = abs(residuals),
    interpolated_upper = approx(x = fit_df_residue$min_dist_to_ligand,
                                y = fit_df_residue$upper,
                                xout = min_dist_to_ligand,
                                rule = 2)$y,
    sig_above_curve = observed_abs_res > interpolated_upper
  )

# 4. Get mutations significantly above the curve
significant_mutations <- merged_df_neg %>%
  filter(sig_above_curve) 

# Preview
#significant_mutations %>% pull(mutant)
half_d <- log(2) / b  
half_d #14.51313
## [1] 14.51359
#y_cutoff <- a  * exp(-b    * half_d)

merged_df_neg$site_type <- "Non-orthosteric site"
#merged_df$site_type[abs(merged_df$residuals) <= y_cutoff] <- "Null"
merged_df_neg$site_type[merged_df_neg$mutation_position %in% active_positions] <- "Active site"
merged_df_neg$site_type[merged_df_neg$mutation_position %in% binding_positions] <- "Binding site"

x_vals <- seq(min(merged_df_neg$min_dist_to_ligand),
              max(merged_df_neg$min_dist_to_ligand), length.out = 200)

# --- Bootstrapping for confidence intervals ---
set.seed(11)
boot_params <- replicate(1000, {
  samp <- merged_df_neg[sample(nrow(merged_df_neg), replace = TRUE), ]
  fit <- try(nlsLM(abs(residuals) ~ a * exp(-b * min_dist_to_ligand),
                   data = samp, start = list(a = 1, b = 0.1)), silent = TRUE)
  if (inherits(fit, "try-error")) c(NA, NA) else coef(fit)
})
boot_params <- t(boot_params)[complete.cases(t(boot_params)), ]

boot_preds <- apply(boot_params, 1, function(p) p[1] * exp(-p[2] * x_vals))

fit_df_residue <- data.frame(
  min_dist_to_ligand = x_vals,
  residuals = predict(exp_model, newdata = data.frame(min_dist_to_ligand = x_vals)),
  lower = apply(boot_preds, 1, quantile, probs = 0.025),
  upper = apply(boot_preds, 1, quantile, probs = 0.975)
)

merged_df_neg$pathogenic_status <- ifelse(
  merged_df_neg$clinvar_clinical_significance %in% c("pathogenic", "likely_pathogenic"),
  "Pathogenic", "Other"
)

table(merged_df_neg$pathogenic_status)
## 
##      Other Pathogenic 
##       1916         87
# Plot
p21 <- ggplot(merged_df_neg, aes(x = min_dist_to_ligand, y = abs(residuals))) +
  geom_point(aes(color = site_type, alpha = pathogenic_status, shape = pathogenic_status), size = 2) +
  geom_ribbon(data = fit_df_residue,
              aes(x = min_dist_to_ligand, ymin = lower, ymax = upper),
              fill = "grey70", alpha = 0.3, inherit.aes = FALSE) +
  geom_line(data = fit_df_residue,
            aes(x = min_dist_to_ligand, y = residuals),
            inherit.aes = FALSE, color = "black", size = 1) +
  geom_text_repel(
    data = merged_df_neg %>% filter(sig_above_curve) %>% filter(pathogenic_status == "Pathogenic") %>% filter(site_type != "Binding site"),
    aes(label = mutant, color = site_type),
    size = 3, max.overlaps = Inf, box.padding = 0.4, point.padding = 0.3
  ) +
  scale_color_manual(values = c(
    "Non-orthosteric site" = "darkgreen",
    "Binding site" = "cyan",
    "Active site" = "orange"
  )) +
  scale_alpha_manual(values = c("Other" = 0.1, "Pathogenic" = 1)) +
  geom_vline(xintercept = 16.42677, linetype = "dashed", color = "slategrey") +
  theme_classic() +
  labs(
    title = "Per-mutation Allosteric Decay",
    subtitle = paste(nrow(merged_df_neg), "mutations with positive residuals"),
    x = "Minimal distance to active sites (TLA)",
    y = "|LOESS residual|",
    color = ""
  ) +
  theme(legend.position = "none") +
  guides(color = "none", alpha = "none")

p21 <- ggMarginal(
  p21,
  type = "density",
  margins = "both",
  groupColour = FALSE,
  groupFill = FALSE,
  size = 10,
  colour = "grey",
  fill = "lightgrey"
)

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig4_decay2.pdf", 
       plot = p21, width = 5, height = 5, dpi = 300)
p21

p_legend <- ggplot(merged_df_neg, aes(x = min_dist_to_ligand, y = abs(residuals))) +
  geom_point(aes(color = site_type, alpha = pathogenic_status, shape = pathogenic_status), size = 2) +
  geom_ribbon(data = fit_df_residue,
              aes(x = min_dist_to_ligand, ymin = lower, ymax = upper),
              fill = "grey70", alpha = 0.3, inherit.aes = FALSE) +
  geom_line(data = fit_df_residue,
            aes(x = min_dist_to_ligand, y = residuals),
            inherit.aes = FALSE, color = "black", size = 1) +
  geom_text_repel(
    data = merged_df_neg %>% filter(sig_above_curve) %>% filter(pathogenic_status == "Pathogenic") %>% filter(site_type != "Binding site"),
    aes(label = mutant, color = site_type),
    size = 3, max.overlaps = Inf, box.padding = 0.4, point.padding = 0.3
  ) +
  scale_color_manual(values = c(
    "Non-orthosteric site" = "darkgreen",
    "Binding site" = "cyan",
    "Active site" = "orange"
  )) +
  scale_alpha_manual(values = c("Other" = 0.1, "Pathogenic" = 1)) +
  geom_vline(xintercept = 16.42677, linetype = "dashed", color = "slategrey") +
  theme_classic() +
  labs(
    title = "Per-mutation Allosteric Decay",
    subtitle = paste(nrow(merged_df_neg), "mutations with positive residuals"),
    x = "Minimal distance to active sites (TLA)",
    y = "|LOESS residual|",
    color = ""
  ) +
  theme(legend.position = "bottom") +
  guides(color = "none", alpha = "none") 

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig4_decay_legend.pdf", 
       plot = p_legend, width = 5, height = 5, dpi = 300)
# merged_df_neg$pathogenic_status <- ifelse(
#   merged_df_neg$clinvar_clinical_significance %in% c("pathogenic", "likely_pathogenic"),
#   "Pathogenic", "Other"
# )
# 
# merged_df_neg$curve_residual <- merged_df_neg$observed_abs_res - merged_df_neg$expected_res
# 
# merged_df_neg_patho_nonortho <- merged_df_neg %>% filter(pathogenic_status == "Pathogenic") %>%
#   filter(site_type == "Non-orthosteric site")
# nrow(merged_df_neg_patho_nonortho) #57
# 
# merged_df_neg_non_patho_nonortho <- merged_df_neg %>% filter(pathogenic_status != "Pathogenic") %>%
#   filter(site_type == "Non-orthosteric site")
# nrow(merged_df_neg_non_patho_nonortho) #1232
# 
# wilcox.test(merged_df_neg_patho_nonortho$curve_residual, merged_df_neg_non_patho_nonortho$curve_residual)
# # 1. Combine data
# plot_df <- bind_rows(
#   merged_df_neg_patho_nonortho %>%
#     mutate(group = "Pathogenic"),
#   merged_df_neg_non_patho_nonortho %>%
#     mutate(group = "Non-pathogenic")
# )
# 
# # 2. Summary stats for labeling
# label_df <- plot_df %>%
#   group_by(group) %>%
#   summarise(
#     n_label = paste0("n = ", n()),
#     median_val = median(curve_residual, na.rm = TRUE)
#   )
# plot_df$group <- factor(plot_df$group, levels = c("Pathogenic", "Non-pathogenic"))
# 
# # 3. Plot
# p_violin <- ggplot(plot_df, aes(x = group, y = curve_residual, fill = group)) +
#   geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
#   geom_boxplot(width = 0.5, outlier.shape = NA, alpha = 0.6, color = "lightgrey")  +
#   stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
#   stat_summary(fun = median, geom = "point", shape = 23, size = 2.5,
#                fill = "black", color = "black", stroke = 0.7) +
# 
#   # Add n labels
#   geom_text(
#     data = label_df,
#     aes(x = group, y = max(plot_df$curve_residual, na.rm = TRUE) + 0.2, label = n_label),
#     inherit.aes = FALSE,
#     size = 4
#   ) +
# 
#   # Add median labels
#   geom_text(
#     data = label_df,
#     aes(x = group, y = median_val + 0.5, label = sprintf("%.2f", median_val)),
#     inherit.aes = FALSE,
#     size = 6
#   ) +
# 
#   labs(
#     x = "",
#     y = "Residual to exponential curve",
#     title = "Non-orthosteric Variants",
#     subtitle = "With negative activity-abundance residuals"
#   ) +
#   theme_classic(base_size = 14) +
#   scale_fill_manual(values = c(
#     "Pathogenic" = "indianred3",
#     "Non-pathogenic" = "tan3"
#   )) +
#   theme(legend.position = "none") +
# 
#   # Wilcoxon significance
#   geom_signif(comparisons = list(c("Pathogenic", "Non-pathogenic")),
#               map_signif_level = FALSE,
#               test = "wilcox.test",
#               tip_length = 0.01)
# 
# # Show plot
# p_violin
# merged_df_neg_patho_ortho <- merged_df_neg %>% filter(pathogenic_status == "Pathogenic") %>%
#   filter(site_type != "Non-orthosteric site")
# nrow(merged_df_neg_patho_ortho) #57
# merged_df_neg_non_patho_ortho <- merged_df_neg %>% filter(pathogenic_status != "Pathogenic") %>%
#   filter(site_type != "Non-orthosteric site")
# nrow(merged_df_neg_non_patho_ortho) #657
# wilcox.test(merged_df_neg_patho_ortho$curve_residual, merged_df_neg_non_patho_ortho$curve_residual)
# # 1. Combine data
# plot_df2 <- bind_rows(
#   merged_df_neg_patho_ortho %>%
#     mutate(group = "Pathogenic"),
#   merged_df_neg_non_patho_ortho %>%
#     mutate(group = "Non-pathogenic")
# )
# 
# # 2. Summary stats for labeling
# label_df2 <- plot_df2 %>%
#   group_by(group) %>%
#   summarise(
#     n_label = paste0("n = ", n()),
#     median_val = median(curve_residual, na.rm = TRUE)
#   )
# plot_df2$group <- factor(plot_df2$group, levels = c("Pathogenic", "Non-pathogenic"))
# 
# # 3. Plot
# p_violin2 <- ggplot(plot_df2, aes(x = group, y = curve_residual, fill = group)) +
#   geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
#   geom_boxplot(width = 0.5, outlier.shape = NA, alpha = 0.6, color = "lightgrey")  +
#   stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
#   stat_summary(fun = median, geom = "point", shape = 23, size = 2.5,
#                fill = "black", color = "black", stroke = 0.7) +
# 
#   # Add n labels
#   geom_text(
#     data = label_df2,
#     aes(x = group, y = max(plot_df2$curve_residual, na.rm = TRUE) + 0.2, label = n_label),
#     inherit.aes = FALSE,
#     size = 4
#   ) +
# 
#   # Add median labels
#   geom_text(
#     data = label_df2,
#     aes(x = group, y = median_val + 0.5, label = sprintf("%.2f", median_val)),
#     inherit.aes = FALSE,
#     size = 6
#   ) +
# 
#   labs(
#     x = "",
#     y = "Residual to exponential curve",
#     title = "PTEN: Orthosteric Variants",
#     subtitle = "With negative activity-abundance residuals"
#   ) +
#   theme_classic(base_size = 14) +
#   scale_fill_manual(values = c(
#     "Pathogenic" = "indianred3",
#     "Non-pathogenic" = "tan3"
#   )) +
#   theme(legend.position = "none") +
# 
#   # Wilcoxon significance
#   geom_signif(comparisons = list(c("Pathogenic", "Non-pathogenic")),
#               map_signif_level = FALSE,
#               test = "wilcox.test",
#               tip_length = 0.01)
# 
# # Show plot
# p_violin2
# plot <- plot_grid(p_violin2, p_violin, ncol = 1, nrow =2)
# ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig4_violin6.pdf", 
#        plot = plot, width = 3, height = 6, dpi = 300)
# plot
# merged_df_neg_patho <- merged_df_neg %>%
#   filter(
#     pathogenic_status == "Pathogenic" |
#     mutant %in% (pten_somatic$mutant)
#   )
# nrow(merged_df_neg_patho)
# table(merged_df_neg_patho$sig_above_curve)
# 
# merged_df_neg_patho_ortho <- merged_df_neg %>%
#   filter(
#     pathogenic_status == "Pathogenic" |
#     mutant %in% (pten_somatic$mutant)
#   )  %>% filter(site_type != "Non-orthosteric site")
# table(merged_df_neg_patho_ortho$sig_above_curve)
# 
# merged_df_neg_patho_nonortho <- merged_df_neg %>%
#   filter(
#     pathogenic_status == "Pathogenic" |
#     mutant %in% (pten_somatic$mutant)
#   ) %>% filter(site_type == "Non-orthosteric site")
# table(merged_df_neg_patho_nonortho$sig_above_curve)
# plot_df <- merged_df_neg %>%
#   filter(
#     pathogenic_status == "Pathogenic" |
#     mutant %in% (pten_somatic$mutant)
#   ) %>%
#   mutate(
#     group = ifelse(site_type == "Non-orthosteric site", "Non-orthosteric", "Orthosteric"),
#     group = factor(group, levels = c("Orthosteric", "Non-orthosteric"))
#   ) %>%
#   group_by(group, sig_above_curve) %>%
#   summarise(count = n(), .groups = "drop") %>%
#   group_by(group) %>%
#   mutate(percentage = 100 * count / sum(count))
# 
# # Plot
# p2 <- ggplot(plot_df, aes(x = group, y = percentage, fill = sig_above_curve)) +
#   geom_col(width = 0.6) +
#   geom_text(
#     aes(label = sprintf("%d (%.0f%%)", count, percentage)),
#     position = position_stack(vjust = 0.5),
#     size = 4
#   ) +
#   scale_y_continuous(limits = c(0, 100), expand = c(0, 0)) +
#   scale_fill_manual(
#     values = c("TRUE" = "indianred3", "FALSE" = "grey80"),
#     labels = c("Not above the exponential curve", "Above the exponential curve")
#   ) +
#   labs(
#     x = "",
#     y = "Variant percentage (%)",
#     title = "PTEN: 114 ClinVar Pathogenic/Somatic Vatiants",
#     subtitle = "With negative activity-abundance residuals",
#     fill = ""
#   ) +
#   theme_classic(base_size = 14) + theme(legend.position = "bottom")
# ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig4_boxplot1.pdf", 
#        plot = p2, width = 4, height = 5, dpi = 300)
# p2
# merged_df_neg_non_patho <- merged_df_neg %>%
#   filter(
#     pathogenic_status != "Pathogenic" &
#     !mutant %in% na.omit(pten_somatic$mutant)
#   )
# 
# nrow(merged_df_neg_non_patho)
# table(merged_df_neg_non_patho$sig_above_curve)
# 
# merged_df_neg_non_patho_ortho <- merged_df_neg %>%
#   filter(
#     pathogenic_status != "Pathogenic" &
#     !mutant %in% na.omit(pten_somatic$mutant)
#   ) %>% filter(site_type != "Non-orthosteric site")
# table(merged_df_neg_non_patho_ortho$sig_above_curve)
# 
# merged_df_neg_non_patho_nonortho <- merged_df_neg %>%
#   filter(
#     pathogenic_status != "Pathogenic" &
#     !mutant %in% na.omit(pten_somatic$mutant)
#   )%>% filter(site_type == "Non-orthosteric site")
# table(merged_df_neg_non_patho_nonortho$sig_above_curve)
# plot_df <- merged_df_neg %>%
#   filter(
#     pathogenic_status != "Pathogenic" &
#     !mutant %in% na.omit(pten_somatic$mutant)
#   ) %>%
#   mutate(
#     group = ifelse(site_type == "Non-orthosteric site", "Non-orthosteric", "Orthosteric"),
#     group = factor(group, levels = c("Orthosteric", "Non-orthosteric"))
#   ) %>%
#   group_by(group, sig_above_curve) %>%
#   summarise(count = n(), .groups = "drop") %>%
#   group_by(group) %>%
#   mutate(percentage = 100 * count / sum(count))
# 
# # Plot
# p3 <- ggplot(plot_df, aes(x = group, y = percentage, fill = sig_above_curve)) +
#   geom_col(width = 0.6) +
#   geom_text(
#     aes(label = sprintf("%d (%.0f%%)", count, percentage)),
#     position = position_stack(vjust = 0.5),
#     size = 4
#   ) +
#   scale_y_continuous(limits = c(0, 100), expand = c(0, 0)) +
#   scale_fill_manual(
#     values = c("TRUE" = "indianred3", "FALSE" = "grey80"),
#     labels = c("Not above the exponential curve", "Above the exponential curve")
#   ) +
#   labs(
#     x = "",
#     y = "Variant percentage (%)",
#     title = "PTEN: 1889 Non-pathogenic/somatic Vatiants",
#     subtitle = "With negative activity-abundance residuals",
#     fill = ""
#   ) +
#   theme_classic(base_size = 14) + theme(legend.position = "bottom")
# ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig4_boxplot2.pdf", 
#        plot = p3, width = 4, height = 5, dpi = 300)
# p3
lm_model <- lm(log(abs(residuals)) ~ min_dist_to_ligand, data = merged_df)
summary(lm_model)
## 
## Call:
## lm(formula = log(abs(residuals)) ~ min_dist_to_ligand, data = merged_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.8770 -0.6436  0.1605  0.9197  2.3642 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         0.134341   0.057172    2.35   0.0188 *  
## min_dist_to_ligand -0.035253   0.002717  -12.97   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.238 on 3736 degrees of freedom
## Multiple R-squared:  0.04311,    Adjusted R-squared:  0.04285 
## F-statistic: 168.3 on 1 and 3736 DF,  p-value: < 2.2e-16
# Call:
# lm(formula = log(abs(residuals)) ~ min_dist_to_ligand, data = merged_df)
# 
# Residuals:
#     Min      1Q  Median      3Q     Max 
# -7.3797 -0.6332  0.3234  0.9096  2.2610 
# 
# Coefficients:
#                     Estimate Std. Error t value Pr(>|t|)    
# (Intercept)         0.662759   0.074530   8.892   <2e-16 ***
# min_dist_to_ligand -0.051687   0.003745 -13.804   <2e-16 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# Residual standard error: 1.268 on 2001 degrees of freedom
# Multiple R-squared:  0.08694, Adjusted R-squared:  0.08649 
# F-statistic: 190.5 on 1 and 2001 DF,  p-value: < 2.2e-16
#merged_df <- merge(pten_df_clean, fil_protein_ca, by.x="mutation_position", by.y = "resno", all.x = TRUE)
#merged_df <- merged_df %>% dplyr::select(-sequence)

#merged_df <- merged_df %>% filter(!is.na(min_dist_to_ligand))
merged_df <- pten_df_clean
merged_df$pathogenic_status <- ifelse(
  merged_df$clinvar_clinical_significance %in% c("pathogenic",
                                           "likely_pathogenic"),
  "Pathogenic", "Other"
)
nrow(merged_df)
## [1] 4839
merged_df_int <- merged_df %>% filter(mutation_position %in% all_positions)
nrow(merged_df_int) #4839
## [1] 1385
table(merged_df_int$pathogenic_status)
## 
##      Other Pathogenic 
##       1325         60
 # Other Pathogenic 
 #      1325         60 

merged_df_out <- merged_df %>% filter(!mutation_position %in% all_positions)
nrow(merged_df_out) #1385
## [1] 3454
table(merged_df_out$pathogenic_status)
## 
##      Other Pathogenic 
##       3399         55
     # Other Pathogenic 
     #  3399         55
# 1. Set up
set.seed(123)
n_boot <- 1000
match_window <- 0.5
n_patho <- nrow(merged_df_int %>% filter(pathogenic_status == "Pathogenic"))

# 2. Get abundance/residuals from pten_patho
patho_df <- merged_df_int %>% filter(pathogenic_status == "Pathogenic") %>% dplyr::select(DMS_score_abundance, residuals)
patho_df$group <- "Pathogenic"

# 3. Bootstrap sampling from non-patho pool with abundance matching
non_patho_pool <- merged_df_int %>% filter(pathogenic_status != "Pathogenic") %>%
  filter(!(mutant %in% pten_patho$mutant)) %>%
  filter(!(mutant %in% pten_somatic$mutant)) %>% dplyr::select(DMS_score_abundance, residuals)

bootstrap_medians <- vector("numeric", length = n_boot)

# Pre-group non-patho pool into bins
non_patho_pool <- non_patho_pool %>%
  mutate(bin = cut(DMS_score_abundance, breaks = seq(-0.5, 2, by = match_window)))

# Bin pathogenic variants accordingly
patho_df <- patho_df %>%
  mutate(bin = cut(DMS_score_abundance, breaks = seq(-0.5, 2, by = match_window)))

# Create lookup table for fast sampling
bin_lookup <- split(non_patho_pool$residuals, non_patho_pool$bin)

# Bootstrap matrix
bootstrap_matrix <- matrix(NA, nrow = n_boot, ncol = n_patho)

for (i in 1:n_boot) {
  for (j in 1:n_patho) {
    bin_j <- patho_df$bin[j]
    candidates <- bin_lookup[[as.character(bin_j)]]
    if (!is.null(candidates) && length(candidates) > 0) {
      bootstrap_matrix[i, j] <- sample(candidates, 1)
    }
  }
}

# Summarize into a dataframe
boot_df <- data.frame(
  group = "Random abundance-matched",
  residuals = apply(bootstrap_matrix, 1, median, na.rm = TRUE)   # Mean across matches per bootstrap
)

# Combine with patho residuals
plot_df <- bind_rows(
  patho_df %>% dplyr::select(group, residuals),
  boot_df
)

label_df <- plot_df %>%
  group_by(group) %>%
  summarise(
    n = n(),
    median_val = median(residuals),
    y_max = max(residuals),
    .groups = "drop"
  )

label_df <- label_df %>%
  mutate(n_label = case_when(
    group == "Random abundance-matched" ~ "bootstrapped 1000 times",
    TRUE ~ paste0("n = ", n)
  ))

# Plot
p_fast <- ggplot(plot_df, aes(x = group, y = residuals, fill = group)) +
  geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
  geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
    stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
  stat_summary(fun = median, geom = "point", shape = 23, size = 2.5,
               fill = "black", color = "black", stroke = 0.7) +

 geom_text(
  data = label_df,
  aes(x = group, y = 4.5, label = n_label),
  inherit.aes = FALSE,
  size = 4) +

  geom_text(
    data = label_df,
    aes(x = group, y = median_val + 0.25, label = sprintf(" %.2f", median_val)),
    inherit.aes = FALSE,
    size = 6
  ) +
  labs(
    x = "",
    y = "Activity-abundance residuals",
    title = "Orthosteric variants"
  ) +
  theme_classic(base_size = 14) +
  scale_fill_manual(values = c("Pathogenic" = "cornflowerblue", "Random abundance-matched" = "darkolivegreen3")) +
  theme(legend.position = "none") +
  geom_signif(comparisons = list(c("Pathogenic", "Random abundance-matched")),
              map_signif_level = FALSE,
              test = "wilcox.test",
              tip_length = 0.01)


p_fast

# 1. Set up
set.seed(123)
n_boot <- 1000
match_window <- 0.5
n_patho <- nrow(merged_df_out %>% filter(pathogenic_status == "Pathogenic"))

# 2. Get abundance/residuals from pten_patho
patho_df <- merged_df_out %>% filter(pathogenic_status == "Pathogenic") %>% dplyr::select(DMS_score_abundance, residuals)
patho_df$group <- "Pathogenic"

# 3. Bootstrap sampling from non-patho pool with abundance matching
non_patho_pool <- merged_df_out %>% filter(pathogenic_status != "Pathogenic") %>%
  filter(!(mutant %in% pten_patho$mutant)) %>%
  filter(!(mutant %in% pten_somatic$mutant)) %>% dplyr::select(DMS_score_abundance, residuals)

bootstrap_medians <- vector("numeric", length = n_boot)

# Pre-group non-patho pool into bins
non_patho_pool <- non_patho_pool %>%
  mutate(bin = cut(DMS_score_abundance, breaks = seq(-0.5, 2, by = match_window)))

# Bin pathogenic variants accordingly
patho_df <- patho_df %>%
  mutate(bin = cut(DMS_score_abundance, breaks = seq(-0.5, 2, by = match_window)))

# Create lookup table for fast sampling
bin_lookup <- split(non_patho_pool$residuals, non_patho_pool$bin)

# Bootstrap matrix
bootstrap_matrix <- matrix(NA, nrow = n_boot, ncol = n_patho)

for (i in 1:n_boot) {
  for (j in 1:n_patho) {
    bin_j <- patho_df$bin[j]
    candidates <- bin_lookup[[as.character(bin_j)]]
    if (!is.null(candidates) && length(candidates) > 0) {
      bootstrap_matrix[i, j] <- sample(candidates, 1)
    }
  }
}

# Summarize into a dataframe
boot_df <- data.frame(
  group = "Random abundance-matched",
  residuals = apply(bootstrap_matrix, 1, median, na.rm = TRUE)   # Mean across matches per bootstrap
)

# Combine with patho residuals
plot_df <- bind_rows(
  patho_df %>% dplyr::select(group, residuals),
  boot_df
)

label_df <- plot_df %>%
  group_by(group) %>%
  summarise(
    n = n(),
    median_val = median(residuals),
    y_max = max(residuals),
    .groups = "drop"
  )

label_df <- label_df %>%
  mutate(n_label = case_when(
    group == "Random abundance-matched" ~ "bootstrapped 1000 times",
    TRUE ~ paste0("n = ", n)
  ))

# Plot
p_fast2 <- ggplot(plot_df, aes(x = group, y = residuals, fill = group)) +
  geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
  geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
    stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
  stat_summary(fun = median, geom = "point", shape = 23, size = 2.5,
               fill = "black", color = "black", stroke = 0.7) +

 geom_text(
  data = label_df,
  aes(x = group, y = 4.5, label = n_label),
  inherit.aes = FALSE,
  size = 4) +

  geom_text(
    data = label_df,
    aes(x = group, y = median_val + 0.25, label = sprintf(" %.2f", median_val)),
    inherit.aes = FALSE,
    size = 6
  ) +
  labs(
    x = "",
    y = "Activity-abundance residual",
    title = "Non-orthosteric variants"
  ) +
  theme_classic(base_size = 14) +
  scale_fill_manual(values = c("Pathogenic" = "cornflowerblue", "Random abundance-matched" = "darkolivegreen3")) +
  theme(legend.position = "none") +
  geom_signif(comparisons = list(c("Pathogenic", "Random abundance-matched")),
              map_signif_level = FALSE,
              test = "wilcox.test",
              tip_length = 0.01)


p_fast2

fig4G <- plot_grid(p_fast, p_fast2, nrow = 2, ncol=1)
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig4_violin5.pdf", 
       plot = fig4G, width = 3, height = 6, dpi = 300)
fig4G